1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * Copyright (C) 2020 Google, Inc
4 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
5 */
6
7#include <linux/device-mapper.h>
8#include <uapi/linux/dm-user.h>
9
10#include <linux/bio.h>
11#include <linux/init.h>
12#include <linux/mempool.h>
13#include <linux/miscdevice.h>
14#include <linux/module.h>
15#include <linux/poll.h>
16#include <linux/uio.h>
17#include <linux/wait.h>
18#include <linux/workqueue.h>
19
20#define DM_MSG_PREFIX "user"
21
22#define MAX_OUTSTANDING_MESSAGES 128
23
24static unsigned int daemon_timeout_msec = 4000;
25module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
26 0644);
27MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
28 "IO Timeout in msec if daemon does not process");
29
30/*
31 * dm-user uses four structures:
32 *
33 * - "struct target", the outermost structure, corresponds to a single device
34 * mapper target. This contains the set of outstanding BIOs that have been
35 * provided by DM and are not actively being processed by the user, along
36 * with a misc device that userspace can open to communicate with the
37 * kernel. Each time userspace opens the misc device a new channel is
38 * created.
39 * - "struct channel", which represents a single active communication channel
40 * with userspace. Userspace may choose arbitrary read/write sizes to use
41 * when processing messages; the channel assembles these into logical accesses.
42 * When userspace responds to a full message the channel completes the BIO
43 * and obtains a new message to process from the target.
44 * - "struct message", which wraps a BIO with the additional information
45 * required by the kernel to sort out what to do with BIOs when they return
46 * from userspace.
47 * - "struct dm_user_message", which is the exact message format that
48 * userspace sees.
49 *
50 * The hot path contains three distinct operations:
51 *
52 * - user_map(), which is provided a BIO from device mapper that is queued
53 * into the target. This allocates and enqueues a new message.
54 * - dev_read(), which dequeues a message and copies it to userspace.
55 * - dev_write(), which looks up a message (keyed by sequence number) and
56 * completes the corresponding BIO.
57 *
58 * Lock ordering (outer to inner)
59 *
60 * 1) miscdevice's global lock. This is held around dev_open, so it has to be
61 * the outermost lock.
62 * 2) target->lock
63 * 3) channel->lock
64 */
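
/*
 * For orientation only: a minimal sketch of the userspace side of this
 * protocol. It is an illustration, not part of the driver; the device path,
 * the BUF_SZ buffer size and the read_payload_len placeholder are
 * assumptions, and a real daemon (such as Android's snapuserd) does much
 * more. It relies only on the struct dm_user_message fields used throughout
 * this file (seq, type, flags, sector, len) followed by the payload bytes,
 * and on the DM_USER_REQ_* and DM_USER_RESP_* constants from
 * <uapi/linux/dm-user.h>.
 *
 *	int fd = open("/dev/dm-user/foo", O_RDWR | O_CLOEXEC);
 *	struct dm_user_message *msg = malloc(sizeof(*msg) + BUF_SZ);
 *
 *	for (;;) {
 *		// Blocks in dev_read() until the kernel has a request.
 *		if (read(fd, msg, sizeof(*msg) + BUF_SZ) <= 0)
 *			break;
 *
 *		// Satisfy msg->len bytes at msg->sector according to
 *		// msg->type, placing read data right after the header.
 *		msg->type = DM_USER_RESP_SUCCESS;
 *
 *		// Echo the header back (same seq), plus any read payload.
 *		if (write(fd, msg, sizeof(*msg) + read_payload_len) <= 0)
 *			break;
 *	}
 */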
65
66struct message {
67 /*
68 * Messages themselves do not need a lock, they're protected by either
69 * the target or channel's lock, depending on which can reference them
70 * directly.
71 */
72 struct dm_user_message msg;
73 struct bio *bio;
74 size_t posn_to_user;
75 size_t total_to_user;
76 size_t posn_from_user;
77 size_t total_from_user;
78
79 struct list_head from_user;
80 struct list_head to_user;
81
82 /*
83 * These are written back from the user. They live in the same spot in
84 * the message, but we need to either keep the old values around or
85 * call a bunch more BIO helpers. These are only valid after write has
86 * adopted the message.
87 */
88 u64 return_type;
89 u64 return_flags;
90
91 struct delayed_work work;
92 bool delayed;
93 struct target *t;
94};
95
96struct target {
97 /*
98 * A target has a single lock, which protects everything in the target
99 * (but does not protect the channels associated with a target).
100 */
101 struct mutex lock;
102
103 /*
104 * There is only one point at which anything blocks: userspace blocks
105 * reading a new message, which is woken up by device mapper providing
106 * a new BIO to process (or tearing down the target). The
107 * corresponding write side doesn't block; instead we treat userspace's
108 * response containing a message that has yet to be mapped as an
109 * invalid operation.
110 */
111 struct wait_queue_head wq;
112
113 /*
114 * Messages are delivered to userspace in order, but may be returned
115 * out of order. This allows userspace to schedule IO if it wants to.
116 */
117 mempool_t message_pool;
118 u64 next_seq_to_map;
119 u64 next_seq_to_user;
120 struct list_head to_user;
121
122 /*
123 * There is a misc device per target. The name is selected by
124 * userspace (via a DM create ioctl argument), and each ends up in
125 * /dev/dm-user/. It looks like a better way to do this may be to have
126 * a filesystem to manage these, but this was more expedient. The
127 * current mechanism is functional, but does result in an arbitrary
128 * number of dynamically created misc devices.
129 */
130 struct miscdevice miscdev;
131
132 /*
133 * Device mapper's target destructor triggers tearing this all down,
134 * but we can't actually free until every channel associated with this
135 * target has been destroyed. Channels each have a reference to their
136 * target, and there is an additional single reference that corresponds
137 * to both DM and the misc device (both of which are destroyed by DM).
138 *
139 * In the common case userspace will be asleep waiting for a new
140 * message when device mapper decides to destroy the target, which
141 * means no new messages will appear. The destroyed flag triggers a
142 * wakeup, which will end up removing the reference.
143 */
144 struct kref references;
145 int dm_destroyed;
146 bool daemon_terminated;
147};
148
149struct channel {
150 struct target *target;
151
152 /*
153 * A channel has a single lock, which prevents multiple reads (or
154 * multiple writes) from conflicting with each other.
155 */
156 struct mutex lock;
157
158 struct message *cur_to_user;
159 struct message *cur_from_user;
160 ssize_t to_user_error;
161 ssize_t from_user_error;
162
163 /*
164 * Once a message has been forwarded to userspace on a channel it must
165 * be responded to on the same channel. This allows us to error out
166 * the messages that have not yet been responded to by a channel when
167 * that channel closes, which makes handling errors more reasonable for
168 * fault-tolerant userspace daemons. It also happens to make avoiding
169 * shared locks between user_map() and dev_read() a lot easier.
170 *
171 * This does preclude a multi-threaded work stealing userspace
172 * implementation (or at least, force a degree of head-of-line blocking
173 * on the response path).
174 */
175 struct list_head from_user;
176
177 /*
178 * Responses from userspace can arrive in arbitrarily small chunks.
179 * We need some place to buffer one up until we can find the
180 * corresponding kernel-side message to continue processing, so instead
181 * of allocating them we just keep one off to the side here. This can
182 * only ever be pointed to by cur_from_user, and will never have a BIO.
183 */
184 struct message scratch_message_from_user;
185};
186
187static void message_kill(struct message *m, mempool_t *pool)
188{
189 m->bio->bi_status = BLK_STS_IOERR;
190 bio_endio(m->bio);
191 bio_put(m->bio);
192 mempool_free(m, pool);
193}
194
195static inline bool is_user_space_thread_present(struct target *t)
196{
197 lockdep_assert_held(&t->lock);
198 return (kref_read(&t->references) > 1);
199}
200
201static void process_delayed_work(struct work_struct *work)
202{
203 struct delayed_work *del_work = to_delayed_work(work);
204 struct message *msg = container_of(del_work, struct message, work);
205
206 struct target *t = msg->t;
207
208 mutex_lock(&t->lock);
209
210 /*
211 * There is at least one thread to process the IO.
212 */
213 if (is_user_space_thread_present(t)) {
214 mutex_unlock(&t->lock);
215 return;
216 }
217
218 /*
219 * Terminate the IO with an error
220 */
221 list_del(&msg->to_user);
222 pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
223 msg->bio->bi_iter.bi_sector,
224 t->miscdev.name);
225 message_kill(msg, &t->message_pool);
226 mutex_unlock(&t->lock);
227}
228
229static void enqueue_delayed_work(struct message *m, bool is_delay)
230{
231 unsigned long delay = 0;
232
233 m->delayed = true;
234 INIT_DELAYED_WORK(&m->work, process_delayed_work);
235
236 /*
237 * The snapuserd daemon is the user-space process
238 * that services IO requests from dm-user while an
239 * OTA is being applied. Per the current design,
240 * when a dm-user target is created, the daemon
241 * attaches to the target and starts processing
242 * its IO. The daemon is terminated only when the
243 * dm-user target is destroyed.
244 *
245 * If, for some reason, the daemon crashes or terminates
246 * early without destroying the dm-user target, there
247 * is no mechanism to restart the daemon and resume
248 * processing IO from the same target. Theoretically
249 * it is possible, but that infrastructure does not
250 * exist in the Android ecosystem.
251 *
252 * Thus, once the daemon has terminated, the IO issued
253 * on that target can never be processed. Hence,
254 * we set the delay to 0 and fail the IO immediately.
255 *
256 * On the other hand, when a new dm-user target is created,
257 * we wait for the daemon to attach for the first time.
258 * This primarily happens when first-stage init spins up
259 * the daemon. At that point, since the snapshot device is
260 * mounted as the root filesystem, the dm-user target may
261 * receive IO requests before the daemon is fully launched.
262 * We don't want to fail those requests immediately, so we
263 * queue them with a timeout, giving the daemon time to get
264 * ready to process them. If the daemon fails to launch
265 * within the timeout period, the IO is then failed.
266 */
267 if (is_delay)
268 delay = msecs_to_jiffies(daemon_timeout_msec);
269
270 queue_delayed_work(system_wq, &m->work, delay);
271}
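
/*
 * A small worked example of the delay above (HZ is kernel configuration
 * dependent; 250 is only an assumption for illustration): with the default
 * daemon_timeout_msec of 4000 and HZ == 250, msecs_to_jiffies(4000) is
 * 1000 jiffies, so a message queued while no daemon has ever attached sits
 * on system_wq for roughly four seconds before process_delayed_work()
 * either finds a user-space thread or fails the IO. With is_delay == false
 * the work runs as soon as the workqueue gets to it.
 */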
272
273static inline struct target *target_from_target(struct dm_target *target)
274{
275 WARN_ON(target->private == NULL);
276 return target->private;
277}
278
279static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
280{
281 return container_of(miscdev, struct target, miscdev);
282}
283
284static inline struct channel *channel_from_file(struct file *file)
285{
286 WARN_ON(file->private_data == NULL);
287 return file->private_data;
288}
289
290static inline struct target *target_from_channel(struct channel *c)
291{
292 WARN_ON(c->target == NULL);
293 return c->target;
294}
295
296static inline size_t bio_size(struct bio *bio)
297{
298 struct bio_vec bvec;
299 struct bvec_iter iter;
300 size_t out = 0;
301
302 bio_for_each_segment (bvec, bio, iter)
303 out += bio_iter_len(bio, iter);
304 return out;
305}
306
307static inline size_t bio_bytes_needed_to_user(struct bio *bio)
308{
309 switch (bio_op(bio)) {
310 case REQ_OP_WRITE:
311 return sizeof(struct dm_user_message) + bio_size(bio);
312 case REQ_OP_READ:
313 case REQ_OP_FLUSH:
314 case REQ_OP_DISCARD:
315 case REQ_OP_SECURE_ERASE:
316 case REQ_OP_WRITE_SAME:
317 case REQ_OP_WRITE_ZEROES:
318 return sizeof(struct dm_user_message);
319
320 /*
321 * These ops are not passed to userspace under the assumption that
322 * they're not going to be particularly useful in that context.
323 */
324 default:
325 return -EOPNOTSUPP;
326 }
327}
328
329static inline size_t bio_bytes_needed_from_user(struct bio *bio)
330{
331 switch (bio_op(bio)) {
332 case REQ_OP_READ:
333 return sizeof(struct dm_user_message) + bio_size(bio);
334 case REQ_OP_WRITE:
335 case REQ_OP_FLUSH:
336 case REQ_OP_DISCARD:
337 case REQ_OP_SECURE_ERASE:
338 case REQ_OP_WRITE_SAME:
339 case REQ_OP_WRITE_ZEROES:
340 return sizeof(struct dm_user_message);
341
342 /*
343 * These ops are not passed to userspace under the assumption that
344 * they're not going to be particularly useful in that context.
345 */
346 default:
347 return -EOPNOTSUPP;
348 }
349}
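
/*
 * A worked example of the two sizing helpers above, with a 4096-byte
 * payload (sizeof(struct dm_user_message) is left symbolic since it is
 * defined by the uapi header):
 *
 *	REQ_OP_WRITE, 4 KiB:  to_user   = sizeof(msg) + 4096 (header + data)
 *	                      from_user = sizeof(msg)        (status only)
 *	REQ_OP_READ,  4 KiB:  to_user   = sizeof(msg)        (request only)
 *	                      from_user = sizeof(msg) + 4096 (status + data)
 *	REQ_OP_FLUSH:         sizeof(msg) in both directions.
 *
 * These totals are what posn_to_user and posn_from_user are compared
 * against in dev_read() and dev_write() to decide when a message has been
 * fully transferred.
 */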
350
351static inline long bio_type_to_user_type(struct bio *bio)
352{
353 switch (bio_op(bio)) {
354 case REQ_OP_READ:
355 return DM_USER_REQ_MAP_READ;
356 case REQ_OP_WRITE:
357 return DM_USER_REQ_MAP_WRITE;
358 case REQ_OP_FLUSH:
359 return DM_USER_REQ_MAP_FLUSH;
360 case REQ_OP_DISCARD:
361 return DM_USER_REQ_MAP_DISCARD;
362 case REQ_OP_SECURE_ERASE:
363 return DM_USER_REQ_MAP_SECURE_ERASE;
364 case REQ_OP_WRITE_SAME:
365 return DM_USER_REQ_MAP_WRITE_SAME;
366 case REQ_OP_WRITE_ZEROES:
367 return DM_USER_REQ_MAP_WRITE_ZEROES;
368
369 /*
370 * These ops are not passed to userspace under the assumption that
371 * they're not going to be particularly useful in that context.
372 */
373 default:
374 return -EOPNOTSUPP;
375 }
376}
377
378static inline long bio_flags_to_user_flags(struct bio *bio)
379{
380 u64 out = 0;
381 typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;
382
383 if (opf & REQ_FAILFAST_DEV) {
384 opf &= ~REQ_FAILFAST_DEV;
385 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
386 }
387
388 if (opf & REQ_FAILFAST_TRANSPORT) {
389 opf &= ~REQ_FAILFAST_TRANSPORT;
390 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
391 }
392
393 if (opf & REQ_FAILFAST_DRIVER) {
394 opf &= ~REQ_FAILFAST_DRIVER;
395 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
396 }
397
398 if (opf & REQ_SYNC) {
399 opf &= ~REQ_SYNC;
400 out |= DM_USER_REQ_MAP_FLAG_SYNC;
401 }
402
403 if (opf & REQ_META) {
404 opf &= ~REQ_META;
405 out |= DM_USER_REQ_MAP_FLAG_META;
406 }
407
408 if (opf & REQ_PRIO) {
409 opf &= ~REQ_PRIO;
410 out |= DM_USER_REQ_MAP_FLAG_PRIO;
411 }
412
413 if (opf & REQ_NOMERGE) {
414 opf &= ~REQ_NOMERGE;
415 out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
416 }
417
418 if (opf & REQ_IDLE) {
419 opf &= ~REQ_IDLE;
420 out |= DM_USER_REQ_MAP_FLAG_IDLE;
421 }
422
423 if (opf & REQ_INTEGRITY) {
424 opf &= ~REQ_INTEGRITY;
425 out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
426 }
427
428 if (opf & REQ_FUA) {
429 opf &= ~REQ_FUA;
430 out |= DM_USER_REQ_MAP_FLAG_FUA;
431 }
432
433 if (opf & REQ_PREFLUSH) {
434 opf &= ~REQ_PREFLUSH;
435 out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
436 }
437
438 if (opf & REQ_RAHEAD) {
439 opf &= ~REQ_RAHEAD;
440 out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
441 }
442
443 if (opf & REQ_BACKGROUND) {
444 opf &= ~REQ_BACKGROUND;
445 out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
446 }
447
448 if (opf & REQ_NOWAIT) {
449 opf &= ~REQ_NOWAIT;
450 out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
451 }
452
453 if (opf & REQ_NOUNMAP) {
454 opf &= ~REQ_NOUNMAP;
455 out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
456 }
457
458 if (unlikely(opf)) {
459 pr_warn("unsupported BIO type %x\n", opf);
460 return -EOPNOTSUPP;
461 }
462 WARN_ON(out < 0);
463 return out;
464}
465
466/*
467 * Not quite what's in blk-map.c, but instead what I thought the functions in
468 * blk-map did. This one seems more generally useful and I think we could
469 * write the blk-map version in terms of this one. The differences are that
470 * this has a return value that counts, and blk-map uses the BIO _all iters.
471 * Neither advances the BIO iter, though both advance the IOV iter, which is
472 * a bit odd here.
473 */
474static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
475{
476 struct bio_vec bvec;
477 struct bvec_iter biter;
478 ssize_t out = 0;
479
480 bio_for_each_segment (bvec, bio, biter) {
481 ssize_t ret;
482
483 ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
484 bvec.bv_len, iter);
485
486 /*
487 * FIXME: I thought that IOV copies had a mechanism for
488 * terminating early, if for example a signal came in while
489 * sleeping waiting for a page to be mapped, but I don't see
490 * where that would happen.
491 */
492 WARN_ON(ret < 0);
493 out += ret;
494
495 if (!iov_iter_count(iter))
496 break;
497
498 if (ret < bvec.bv_len)
499 return ret;
500 }
501
502 return out;
503}
504
505static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
506{
507 struct bio_vec bvec;
508 struct bvec_iter biter;
509 ssize_t out = 0;
510
511 bio_for_each_segment (bvec, bio, biter) {
512 ssize_t ret;
513
514 ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
515 bvec.bv_len, iter);
516
517 /* as above */
518 WARN_ON(ret < 0);
519 out += ret;
520
521 if (!iov_iter_count(iter))
522 break;
523
524 if (ret < bvec.bv_len)
525 return ret;
526 }
527
528 return out;
529}
530
531static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
532{
533 ssize_t copied = 0;
534
535 if (!iov_iter_count(to))
536 return 0;
537
538 if (msg->posn_to_user < sizeof(msg->msg)) {
539 copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
540 sizeof(msg->msg) - msg->posn_to_user, to);
541 } else {
542 copied = bio_copy_to_iter(msg->bio, to);
543 if (copied > 0)
544 bio_advance(msg->bio, copied);
545 }
546
547 if (copied < 0)
548 return copied;
549
550 msg->posn_to_user += copied;
551 return copied;
552}
553
554static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
555{
556 ssize_t copied = 0;
557
558 if (!iov_iter_count(from))
559 return 0;
560
561 if (msg->posn_from_user < sizeof(msg->msg)) {
562 copied = copy_from_iter(
563 (char *)(&msg->msg) + msg->posn_from_user,
564 sizeof(msg->msg) - msg->posn_from_user, from);
565 } else {
566 copied = bio_copy_from_iter(msg->bio, from);
567 if (copied > 0)
568 bio_advance(msg->bio, copied);
569 }
570
571 if (copied < 0)
572 return copied;
573
574 msg->posn_from_user += copied;
575 return copied;
576}
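
/*
 * An illustration of how the position counters above tolerate short
 * userspace IO (the split is made up for the example): if userspace answers
 * a 4 KiB read with several small write() calls, the first calls only
 * accumulate header bytes, with posn_from_user growing toward
 * sizeof(struct dm_user_message). Once the header is complete, dev_write()
 * looks the message up by seq and bio_copy_from_iter() starts filling the
 * BIO pages with whatever arrives next. The BIO is only completed once
 * posn_from_user reaches total_from_user, so arbitrarily small chunks are
 * fine, as the channel comment above promises.
 */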
577
578static struct message *msg_get_map(struct target *t)
579{
580 struct message *m;
581
582 lockdep_assert_held(&t->lock);
583
584 m = mempool_alloc(&t->message_pool, GFP_NOIO);
585 m->msg.seq = t->next_seq_to_map++;
586 INIT_LIST_HEAD(&m->to_user);
587 INIT_LIST_HEAD(&m->from_user);
588 return m;
589}
590
591static struct message *msg_get_to_user(struct target *t)
592{
593 struct message *m;
594
595 lockdep_assert_held(&t->lock);
596
597 if (list_empty(&t->to_user))
598 return NULL;
599
600 m = list_first_entry(&t->to_user, struct message, to_user);
601
602 list_del(&m->to_user);
603
604 /*
605 * If the IO was queued to the workqueue because
606 * there was no daemon to service it, we have to
607 * cancel the delayed work, as the IO will now be
608 * processed by this user-space thread.
609 *
610 * If the delayed work was already picked up for
611 * processing, then wait for it to complete. Note
612 * that the IO will not be terminated by the work
613 * queue thread.
614 */
615 if (unlikely(m->delayed)) {
616 mutex_unlock(&t->lock);
617 cancel_delayed_work_sync(&m->work);
618 mutex_lock(&t->lock);
619 }
620 return m;
621}
622
623static struct message *msg_get_from_user(struct channel *c, u64 seq)
624{
625 struct message *m;
626 struct list_head *cur, *tmp;
627
628 lockdep_assert_held(&c->lock);
629
630 list_for_each_safe (cur, tmp, &c->from_user) {
631 m = list_entry(cur, struct message, from_user);
632 if (m->msg.seq == seq) {
633 list_del(&m->from_user);
634 return m;
635 }
636 }
637
638 return NULL;
639}
640
641/*
642 * Returns 0 when there is no work left to do. This must be callable without
643 * holding the target lock, as it is part of the waitqueue's check expression.
644 * When called without the lock it may spuriously indicate there is remaining
645 * work, but when called with the lock it must be accurate.
646 */
647static int target_poll(struct target *t)
648{
649 return !list_empty(&t->to_user) || t->dm_destroyed;
650}
651
652static void target_release(struct kref *ref)
653{
654 struct target *t = container_of(ref, struct target, references);
655 struct list_head *cur, *tmp;
656
657 /*
658 * There may be outstanding BIOs that have not yet been given to
659 * userspace. At this point there's nothing we can do about them, as
660 * there are and will never be any channels.
661 */
662 list_for_each_safe (cur, tmp, &t->to_user) {
663 struct message *m = list_entry(cur, struct message, to_user);
664
665 if (unlikely(m->delayed)) {
666 bool ret;
667
668 mutex_unlock(&t->lock);
669 ret = cancel_delayed_work_sync(&m->work);
670 mutex_lock(&t->lock);
671 if (!ret)
672 continue;
673 }
674 message_kill(m, &t->message_pool);
675 }
676
677 mempool_exit(&t->message_pool);
678 mutex_unlock(&t->lock);
679 mutex_destroy(&t->lock);
680 kfree(t);
681}
682
683static void target_put(struct target *t)
684{
685 /*
686 * This both releases a reference to the target and the lock. We leave
687 * it up to the caller to hold the lock, as they probably needed it for
688 * something else.
689 */
690 lockdep_assert_held(&t->lock);
691
692 if (!kref_put(&t->references, target_release)) {
693 /*
694 * The user-space thread is being terminated.
695 * We need to scan the list for all pending
696 * IOs that have not been processed yet and
697 * put them back on the workqueue for delayed
698 * processing.
699 */
700 if (!is_user_space_thread_present(t)) {
701 struct list_head *cur, *tmp;
702
703 list_for_each_safe(cur, tmp, &t->to_user) {
704 struct message *m = list_entry(cur,
705 struct message,
706 to_user);
707 if (!m->delayed)
708 enqueue_delayed_work(m, false);
709 }
710 /*
711 * The daemon attached to this target has terminated.
712 */
713 t->daemon_terminated = true;
714 }
715 mutex_unlock(&t->lock);
716 }
717}
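
/*
 * The reference counting above, traced through a typical lifetime (this is
 * a summary of the existing code, not additional behaviour): user_ctr()
 * starts the target at one reference (DM plus the miscdev); every
 * dev_open() adds one via channel_alloc(); every dev_release() drops one
 * via channel_free() -> target_put(); and user_dtr() drops the initial
 * reference. is_user_space_thread_present() therefore reads
 * "references > 1" as "at least one channel is still open", and
 * target_release() only runs once DM and every channel are gone.
 */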
718
719static struct channel *channel_alloc(struct target *t)
720{
721 struct channel *c;
722
723 lockdep_assert_held(&t->lock);
724
725 c = kzalloc(sizeof(*c), GFP_KERNEL);
726 if (c == NULL)
727 return NULL;
728
729 kref_get(&t->references);
730 c->target = t;
731 c->cur_from_user = &c->scratch_message_from_user;
732 mutex_init(&c->lock);
733 INIT_LIST_HEAD(&c->from_user);
734 return c;
735}
736
737static void channel_free(struct channel *c)
738{
739 struct list_head *cur, *tmp;
740
741 lockdep_assert_held(&c->lock);
742
743 /*
744 * There may be outstanding BIOs that have been given to userspace but
745 * have not yet been completed. The channel has been shut down so
746 * there's no way to process the rest of those messages, so we just go
747 * ahead and error out the BIOs. Hopefully whatever's on the other end
748 * can handle the errors. One could imagine splitting the BIOs and
749 * completing as much as we got, but that seems like overkill here.
750 *
751 * Our only other options would be to let the BIO hang around (which
752 * seems way worse) or to resubmit it to userspace in the hope there's
753 * another channel. I don't really like the idea of submitting a
754 * message twice.
755 */
756 if (c->cur_to_user != NULL)
757 message_kill(c->cur_to_user, &c->target->message_pool);
758 if (c->cur_from_user != &c->scratch_message_from_user)
759 message_kill(c->cur_from_user, &c->target->message_pool);
760 list_for_each_safe (cur, tmp, &c->from_user)
761 message_kill(list_entry(cur, struct message, from_user),
762 &c->target->message_pool);
763
764 mutex_lock(&c->target->lock);
765 target_put(c->target);
766 mutex_unlock(&c->lock);
767 mutex_destroy(&c->lock);
768 kfree(c);
769}
770
771static int dev_open(struct inode *inode, struct file *file)
772{
773 struct channel *c;
774 struct target *t;
775
776 /*
777 * This is called by miscdev, which sets private_data to point to the
778 * struct miscdevice that was opened. The rest of our file operations
779 * want to refer to the channel that's been opened, so we swap that
780 * pointer out with a fresh channel.
781 *
782 * This is called with the miscdev lock held, which is also held while
783 * registering/unregistering the miscdev. The miscdev must be
784 * registered for this to get called, which means there must be an
785 * outstanding reference to the target, which means it cannot be freed
786 * out from under us despite us not holding a reference yet.
787 */
788 t = container_of(file->private_data, struct target, miscdev);
789 mutex_lock(&t->lock);
790 file->private_data = c = channel_alloc(t);
791
792 if (c == NULL) {
793 mutex_unlock(&t->lock);
794 return -ENOMEM;
795 }
796
797 mutex_unlock(&t->lock);
798 return 0;
799}
800
801static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
802{
803 struct channel *c = channel_from_file(iocb->ki_filp);
804 ssize_t total_processed = 0;
805 ssize_t processed;
806
807 mutex_lock(&c->lock);
808
809 if (unlikely(c->to_user_error)) {
810 total_processed = c->to_user_error;
811 goto cleanup_unlock;
812 }
813
814 if (c->cur_to_user == NULL) {
815 struct target *t = target_from_channel(c);
816
817 mutex_lock(&t->lock);
818
819 while (!target_poll(t)) {
820 int e;
821
822 mutex_unlock(&t->lock);
823 mutex_unlock(&c->lock);
824 e = wait_event_interruptible(t->wq, target_poll(t));
825 mutex_lock(&c->lock);
826 mutex_lock(&t->lock);
827
828 if (unlikely(e != 0)) {
829 /*
830 * We haven't processed any bytes in either the
831 * BIO or the IOV, so we can just terminate
832 * right now. Code elsewhere in the kernel handles
833 * restarting the syscall when appropriate.
834 */
835 total_processed = e;
836 mutex_unlock(&t->lock);
837 goto cleanup_unlock;
838 }
839 }
840
841 if (unlikely(t->dm_destroyed)) {
842 /*
843 * DM has destroyed this target, so just lock
844 * the user out. There's really nothing else
845 * we can do here. Note that we don't actually
846 * tear any thing down until userspace has
847 * closed the FD, as there may still be
848 * outstanding BIOs.
849 *
850 * This is kind of a wacky error code to
851 * return. My goal was really just to try and
852 * find something that wasn't likely to be
853 * returned by anything else in the miscdev
854 * path. The message "block device required"
855 * seems like a somewhat reasonable thing to
856 * say when the target has disappeared out from
857 * under us, but "not block" isn't sensible.
858 */
859 c->to_user_error = total_processed = -ENOTBLK;
860 mutex_unlock(&t->lock);
861 goto cleanup_unlock;
862 }
863
864 /*
865 * Ensures that accesses to the message data are not ordered
866 * before the remote accesses that produce that message data.
867 *
868 * This pairs with the barrier in user_map(), via the
869 * conditional within the while loop above. Also see the lack
870 * of barrier in user_dtr(), which is why this can be after the
871 * destroyed check.
872 */
873 smp_rmb();
874
875 c->cur_to_user = msg_get_to_user(t);
876 WARN_ON(c->cur_to_user == NULL);
877 mutex_unlock(&t->lock);
878 }
879
880 processed = msg_copy_to_iov(c->cur_to_user, to);
881 total_processed += processed;
882
883 WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
884 if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
885 struct message *m = c->cur_to_user;
886
887 c->cur_to_user = NULL;
888 list_add_tail(&m->from_user, &c->from_user);
889 }
890
891cleanup_unlock:
892 mutex_unlock(&c->lock);
893 return total_processed;
894}
895
896static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
897{
898 struct channel *c = channel_from_file(iocb->ki_filp);
899 ssize_t total_processed = 0;
900 ssize_t processed;
901
902 mutex_lock(&c->lock);
903
904 if (unlikely(c->from_user_error)) {
905 total_processed = c->from_user_error;
906 goto cleanup_unlock;
907 }
908
909 /*
910 * cur_from_user can never be NULL. If there's no real message it must
911 * point to the scratch space.
912 */
913 WARN_ON(c->cur_from_user == NULL);
914 if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
915 struct message *msg, *old;
916
917 processed = msg_copy_from_iov(c->cur_from_user, from);
918 if (processed <= 0) {
919 pr_warn("msg_copy_from_iov() returned %zd\n",
920 processed);
921 c->from_user_error = -EINVAL;
922 goto cleanup_unlock;
923 }
924 total_processed += processed;
925
926 /*
927 * In the unlikely event the user has provided us a very short
928 * write, not even big enough to fill a message, just succeed.
929 * We'll eventually build up enough bytes to do something.
930 */
931 if (unlikely(c->cur_from_user->posn_from_user <
932 sizeof(struct dm_user_message)))
933 goto cleanup_unlock;
934
935 old = c->cur_from_user;
936 mutex_lock(&c->target->lock);
937 msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
938 if (msg == NULL) {
939 pr_info("user provided an invalid messag seq of %llx\n",
940 old->msg.seq);
941 mutex_unlock(&c->target->lock);
942 c->from_user_error = -EINVAL;
943 goto cleanup_unlock;
944 }
945 mutex_unlock(&c->target->lock);
946
947 WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
948 msg->posn_from_user = sizeof(struct dm_user_message);
949 msg->return_type = old->msg.type;
950 msg->return_flags = old->msg.flags;
951 WARN_ON(msg->posn_from_user > msg->total_from_user);
952 c->cur_from_user = msg;
953 WARN_ON(old != &c->scratch_message_from_user);
954 }
955
956 /*
957 * Userspace can signal an error for single requests by overwriting the
958 * type field.
959 */
960 switch (c->cur_from_user->return_type) {
961 case DM_USER_RESP_SUCCESS:
962 c->cur_from_user->bio->bi_status = BLK_STS_OK;
963 break;
964 case DM_USER_RESP_ERROR:
965 case DM_USER_RESP_UNSUPPORTED:
966 default:
967 c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
968 goto finish_bio;
969 }
970
971 /*
972 * The op was a success as far as userspace is concerned, so process
973 * whatever data may come along with it. The user may provide the BIO
974 * data in multiple chunks, in which case we don't need to finish the
975 * BIO.
976 */
977 processed = msg_copy_from_iov(c->cur_from_user, from);
978 total_processed += processed;
979
980 if (c->cur_from_user->posn_from_user <
981 c->cur_from_user->total_from_user)
982 goto cleanup_unlock;
983
984finish_bio:
985 /*
986 * When we set up this message the BIO's size matched the
987 * message size, if that's not still the case then something
988 * has gone off the rails.
989 */
990 WARN_ON(bio_size(c->cur_from_user->bio) != 0);
991 bio_endio(c->cur_from_user->bio);
992 bio_put(c->cur_from_user->bio);
993
994 /*
995 * We don't actually need to take the target lock here, as all
996 * we're doing is freeing the message and mempools have their
997 * own lock. Each channel has its own scratch message.
998 */
999 WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
1000 mempool_free(c->cur_from_user, &c->target->message_pool);
1001 c->scratch_message_from_user.posn_from_user = 0;
1002 c->cur_from_user = &c->scratch_message_from_user;
1003
1004cleanup_unlock:
1005 mutex_unlock(&c->lock);
1006 return total_processed;
1007}
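
/*
 * For illustration, the userspace side of the error path handled above.
 * This is a sketch, not a reference implementation; the fd and the saved
 * request_header are assumptions. To fail a single request the daemon
 * echoes back just the header it read, keeping seq but changing type:
 *
 *	struct dm_user_message resp = request_header;	// same resp.seq
 *	resp.type = DM_USER_RESP_ERROR;			// or _UNSUPPORTED
 *	write(fd, &resp, sizeof(resp));
 *
 * dev_write() then takes the DM_USER_RESP_ERROR branch, sets BLK_STS_IOERR
 * and completes the BIO immediately; no data payload is copied.
 */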
1008
1009static int dev_release(struct inode *inode, struct file *file)
1010{
1011 struct channel *c;
1012
1013 c = channel_from_file(file);
1014 mutex_lock(&c->lock);
1015 channel_free(c);
1016
1017 return 0;
1018}
1019
1020static const struct file_operations file_operations = {
1021 .owner = THIS_MODULE,
1022 .open = dev_open,
1023 .llseek = no_llseek,
1024 .read_iter = dev_read,
1025 .write_iter = dev_write,
1026 .release = dev_release,
1027};
1028
1029static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1030{
1031 struct target *t;
1032 int r;
1033
1034 if (argc != 3) {
1035 ti->error = "Invalid argument count";
1036 r = -EINVAL;
1037 goto cleanup_none;
1038 }
1039
1040 t = kzalloc(sizeof(*t), GFP_KERNEL);
1041 if (t == NULL) {
1042 r = -ENOMEM;
1043 goto cleanup_none;
1044 }
1045 ti->private = t;
1046
1047 /* Enable more BIO types. */
1048 ti->num_discard_bios = 1;
1049 ti->discards_supported = true;
1050 ti->num_flush_bios = 1;
1051 ti->flush_supported = true;
1052
1053 /*
1054 * We begin with a single reference to the target, which is miscdev's
1055 * reference. This ensures that the target won't be freed
1056 * until after the miscdev has been unregistered and all extant
1057 * channels have been closed.
1058 */
1059 kref_init(&t->references);
1060
1061 t->daemon_terminated = false;
1062 mutex_init(&t->lock);
1063 init_waitqueue_head(&t->wq);
1064 INIT_LIST_HEAD(&t->to_user);
1065 mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
1066 sizeof(struct message));
1067
1068 t->miscdev.minor = MISC_DYNAMIC_MINOR;
1069 t->miscdev.fops = &file_operations;
1070 t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
1071 if (t->miscdev.name == NULL) {
1072 r = -ENOMEM;
1073 goto cleanup_message_pool;
1074 }
1075
1076 /*
1077 * Once the miscdev is registered it can be opened and therefore
1078 * concurrent references to the channel can happen. Holding the target
1079 * lock during misc_register() could deadlock. If registration
1080 * succeeds then we will not access the target again so we just stick a
1081 * barrier here, which pairs with taking the target lock everywhere
1082 * else the target is accessed.
1083 *
1084 * I forgot where we ended up on the RCpc/RCsc locks. IIUC RCsc locks
1085 * would mean that we could take the target lock earlier and release it
1086 * here instead of the memory barrier. I'm not sure that's any better,
1087 * though, and this isn't on a hot path so it probably doesn't matter
1088 * either way.
1089 */
1090 smp_mb();
1091
1092 r = misc_register(&t->miscdev);
1093 if (r) {
1094 DMERR("Unable to register miscdev %s for dm-user",
1095 t->miscdev.name);
1096 r = -ENOMEM;
1097 goto cleanup_misc_name;
1098 }
1099
1100 return 0;
1101
1102cleanup_misc_name:
1103 kfree(t->miscdev.name);
1104cleanup_message_pool:
1105 mempool_exit(&t->message_pool);
1106 kfree(t);
1107cleanup_none:
1108 return r;
1109}
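
/*
 * For context, and hedged because the table syntax is interpreted by the
 * device-mapper core and by tooling rather than by this file: the
 * constructor above requires exactly three arguments but only uses the
 * third, argv[2], which becomes the node name under /dev/dm-user/. A
 * target created with the name "foo" is therefore serviced by opening
 * /dev/dm-user/foo, which is where dev_open() comes in.
 */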
1110
1111static void user_dtr(struct dm_target *ti)
1112{
1113 struct target *t = target_from_target(ti);
1114
1115 /*
1116 * Removes the miscdev. This must be called without the target lock
1117 * held to avoid a possible deadlock because our open implementation is
1118 * called holding the miscdev lock and must later take the target lock.
1119 *
1120 * There is no race here because only DM can register/unregister the
1121 * miscdev, and DM ensures that doesn't happen twice. The internal
1122 * miscdev lock is sufficient to ensure there are no races between
1123 * deregistering the miscdev and open.
1124 */
1125 misc_deregister(&t->miscdev);
1126
1127 /*
1128 * We are now free to take the target's lock and drop our reference to
1129 * the target. There are almost certainly tasks sleeping in read on at
1130 * least one of the channels associated with this target, this
1131 * explicitly wakes them up and terminates the read.
1132 */
1133 mutex_lock(&t->lock);
1134 /*
1135 * No barrier here, as wait/wake ensures that the flag visibility is
1136 * correct WRT the wake/sleep state of the target tasks.
1137 */
1138 t->dm_destroyed = true;
1139 wake_up_all(&t->wq);
1140 target_put(t);
1141}
1142
1143/*
1144 * Consumes a BIO from device mapper, queueing it up for userspace.
1145 */
1146static int user_map(struct dm_target *ti, struct bio *bio)
1147{
1148 struct target *t;
1149 struct message *entry;
1150
1151 t = target_from_target(ti);
1152 /*
1153 * FIXME
1154 *
1155 * This seems like a bad idea. Specifically, here we're
1156 * directly on the IO path when we take the target lock, which may also
1157 * be taken from a user context. The user context doesn't actively
1158 * trigger anything that may sleep while holding the lock, but this
1159 * still seems like a bad idea.
1160 *
1161 * The obvious way to fix this would be to use a proper queue, which
1162 * would result in no shared locks between the direct IO path and user
1163 * tasks. I had a version that did this, but the head-of-line blocking
1164 * from the circular buffer resulted in us needing a fairly large
1165 * allocation in order to avoid situations in which the queue fills up
1166 * and everything goes off the rails.
1167 *
1168 * I could jump through some hoops to avoid a shared lock while still
1169 * allowing for a large queue, but I'm not actually sure that allowing
1170 * for very large queues is the right thing to do here. Intuitively it
1171 * seems better to keep the queues small in here (essentially sized to
1172 * the user latency for performance reasons only) and rely on returning
1173 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
1174 * kernel more information.
1175 *
1176 * I'll spend some time trying to figure out what's going on with
1177 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
1178 * this I'm all ears.
1179 */
1180 mutex_lock(&t->lock);
1181
1182 /*
1183 * FIXME
1184 *
1185 * The assumption here is that there's no benefit to returning
1186 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
1187 * sure that's actually true -- for example, I could imagine users
1188 * expecting that submitted BIOs are unlikely to fail and therefore
1189 * relying on submission failure to indicate an unsupported type.
1190 *
1191 * There's two ways I can think of to fix this:
1192 * - Add DM arguments that are parsed during the constructor that
1193 * allow various dm_target flags to be set that indicate the op
1194 * types supported by this target. This may make sense for things
1195 * like discard, where DM can already transform the BIOs to a form
1196 * that's likely to be supported.
1197 * - Some sort of pre-filter that allows userspace to hook in here
1198 * and kill BIOs before marking them as submitted. My guess would
1199 * be that a userspace round trip is a bad idea here, but a BPF
1200 * call seems reasonable.
1201 *
1202 * My guess is that we'd likely want to do both. The first one is easy
1203 * and gives DM the proper info, so it seems better. The BPF call
1204 * seems overly complex for just this, but one could imagine wanting to
1205 * sometimes return _MAPPED and a BPF filter would be the way to do
1206 * that.
1207 *
1208 * For example, in Android we have an in-kernel DM device called
1209 * "dm-bow" that takes advange of some portion of the space that has
1210 * been discarded on a device to provide opportunistic block-level
1211 * backups. While one could imagine just implementing this entirely in
1212 * userspace, that would come with an appreciable performance penalty.
1213 * Instead one could keep a BPF program that forwards most accesses
1214 * directly to the backing block device while informing a userspace
1215 * daemon of any discarded space and on writes to blocks that are to be
1216 * backed up.
1217 */
1218 if (unlikely((bio_type_to_user_type(bio) < 0) ||
1219 (bio_flags_to_user_flags(bio) < 0))) {
1220 mutex_unlock(&t->lock);
1221 return DM_MAPIO_KILL;
1222 }
1223
1224 entry = msg_get_map(t);
1225 if (unlikely(entry == NULL)) {
1226 mutex_unlock(&t->lock);
1227 return DM_MAPIO_REQUEUE;
1228 }
1229
1230 bio_get(bio);
1231 entry->msg.type = bio_type_to_user_type(bio);
1232 entry->msg.flags = bio_flags_to_user_flags(bio);
1233 entry->msg.sector = bio->bi_iter.bi_sector;
1234 entry->msg.len = bio_size(bio);
1235 entry->bio = bio;
1236 entry->posn_to_user = 0;
1237 entry->total_to_user = bio_bytes_needed_to_user(bio);
1238 entry->posn_from_user = 0;
1239 entry->total_from_user = bio_bytes_needed_from_user(bio);
1240 entry->delayed = false;
1241 entry->t = t;
1242 /* Pairs with the barrier in dev_read() */
1243 smp_wmb();
1244 list_add_tail(&entry->to_user, &t->to_user);
1245
1246 /*
1247 * If there is no daemon to process the IOs,
1248 * queue these messages on a workqueue with
1249 * a timeout.
1250 */
1251 if (!is_user_space_thread_present(t))
1252 enqueue_delayed_work(entry, !t->daemon_terminated);
1253
1254 wake_up_interruptible(&t->wq);
1255 mutex_unlock(&t->lock);
1256 return DM_MAPIO_SUBMITTED;
1257}
1258
1259static struct target_type user_target = {
1260 .name = "user",
1261 .version = { 1, 0, 0 },
1262 .module = THIS_MODULE,
1263 .ctr = user_ctr,
1264 .dtr = user_dtr,
1265 .map = user_map,
1266};
1267
1268static int __init dm_user_init(void)
1269{
1270 int r;
1271
1272 r = dm_register_target(&user_target);
1273 if (r) {
1274 DMERR("register failed %d", r);
1275 goto error;
1276 }
1277
1278 return 0;
1279
1280error:
1281 return r;
1282}
1283
1284static void __exit dm_user_exit(void)
1285{
1286 dm_unregister_target(&user_target);
1287}
1288
1289module_init(dm_user_init);
1290module_exit(dm_user_exit);
1291MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
1292MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
1293MODULE_LICENSE("GPL");