1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * Copyright (C) 2020 Google, Inc
4 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
5 */
6
7#include <linux/device-mapper.h>
8#include <uapi/linux/dm-user.h>
9
10#include <linux/bio.h>
11#include <linux/init.h>
12#include <linux/mempool.h>
13#include <linux/miscdevice.h>
14#include <linux/module.h>
15#include <linux/poll.h>
16#include <linux/uio.h>
17#include <linux/wait.h>
18#include <linux/workqueue.h>
19
20#define DM_MSG_PREFIX "user"
21
22#define MAX_OUTSTANDING_MESSAGES 128
23
24static unsigned int daemon_timeout_msec = 4000;
25module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
26 0644);
27MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
28 "IO Timeout in msec if daemon does not process");
29
30/*
31 * dm-user uses four structures:
32 *
33 * - "struct target", the outermost structure, corresponds to a single device
34 * mapper target. This contains the set of outstanding BIOs that have been
35 * provided by DM and are not actively being processed by the user, along
36 * with a misc device that userspace can open to communicate with the
37 * kernel. Each time userspace opens the misc device a new channel is
38 * created.
39 * - "struct channel", which represents a single active communication channel
40 * with userspace. Userspace may choose arbitrary read/write sizes to use
41 * when processing messages; the channel assembles these into logical accesses.
42 * When userspace responds to a full message the channel completes the BIO
43 * and obtains a new message to process from the target.
44 * - "struct message", which wraps a BIO with the additional information
45 * required by the kernel to sort out what to do with BIOs when they return
46 * from userspace.
47 * - "struct dm_user_message", which is the exact message format that
48 * userspace sees.
49 *
50 * The hot path contains three distinct operations:
51 *
52 * - user_map(), which is provided a BIO from device mapper that is queued
53 * into the target. This allocates and enqueues a new message.
54 * - dev_read(), which dequeues a message and copies it to userspace.
55 * - dev_write(), which looks up a message (keyed by sequence number) and
56 * completes the corresponding BIO.
57 *
58 * Lock ordering (outer to inner)
59 *
60 * 1) miscdevice's global lock. This is held around dev_open, so it has to be
61 * the outermost lock.
62 * 2) target->lock
63 * 3) channel->lock
64 */
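
/*
 * For orientation only: a minimal sketch of the userspace side of this
 * protocol. It is an illustration, not part of the driver; the device path,
 * the BUF_SZ buffer size and the read_payload_len placeholder are
 * assumptions, and a real daemon (such as Android's snapuserd) does much
 * more. It relies only on the struct dm_user_message fields used throughout
 * this file (seq, type, flags, sector, len) followed by the payload bytes,
 * and on the DM_USER_REQ_* and DM_USER_RESP_* constants from
 * <uapi/linux/dm-user.h>.
 *
 *	int fd = open("/dev/dm-user/foo", O_RDWR | O_CLOEXEC);
 *	struct dm_user_message *msg = malloc(sizeof(*msg) + BUF_SZ);
 *
 *	for (;;) {
 *		// Blocks in dev_read() until the kernel has a request.
 *		if (read(fd, msg, sizeof(*msg) + BUF_SZ) <= 0)
 *			break;
 *
 *		// Satisfy msg->len bytes at msg->sector according to
 *		// msg->type, placing read data right after the header.
 *		msg->type = DM_USER_RESP_SUCCESS;
 *
 *		// Echo the header back (same seq), plus any read payload.
 *		if (write(fd, msg, sizeof(*msg) + read_payload_len) <= 0)
 *			break;
 *	}
 */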
65
66struct message {
67 /*
68 * Messages themselves do not need a lock, they're protected by either
69 * the target or channel's lock, depending on which can reference them
70 * directly.
71 */
72 struct dm_user_message msg;
73 struct bio *bio;
74 size_t posn_to_user;
75 size_t total_to_user;
76 size_t posn_from_user;
77 size_t total_from_user;
78
79 struct list_head from_user;
80 struct list_head to_user;
81
82 /*
83 * These are written back from the user. They live in the same spot in
84 * the message, but we need to either keep the old values around or
85 * call a bunch more BIO helpers. These are only valid after write has
86 * adopted the message.
87 */
88 u64 return_type;
89 u64 return_flags;
90
91 struct delayed_work work;
92 bool delayed;
93 struct target *t;
94};
95
96struct target {
97 /*
98 * A target has a single lock, which protects everything in the target
99 * (but does not protect the channels associated with a target).
100 */
101 struct mutex lock;
102
103 /*
104 * There is only one point at which anything blocks: userspace blocks
105 * reading a new message, which is woken up by device mapper providing
106 * a new BIO to process (or tearing down the target). The
107 * corresponding write side doesn't block; instead we treat userspace's
108 * response containing a message that has yet to be mapped as an
109 * invalid operation.
110 */
111 struct wait_queue_head wq;
112
113 /*
114 * Messages are delivered to userspace in order, but may be returned
115 * out of order. This allows userspace to schedule IO if it wants to.
116 */
117 mempool_t message_pool;
118 u64 next_seq_to_map;
119 u64 next_seq_to_user;
120 struct list_head to_user;
121
122 /*
123 * There is a misc device per target. The name is selected by
124 * userspace (via a DM create ioctl argument), and each ends up in
125 * /dev/dm-user/. It looks like a better way to do this may be to have
126 * a filesystem to manage these, but this was more expedient. The
127 * current mechanism is functional, but does result in an arbitrary
128 * number of dynamically created misc devices.
129 */
130 struct miscdevice miscdev;
131
132 /*
133 * Device mapper's target destructor triggers tearing this all down,
134 * but we can't actually free until every channel associated with this
135 * target has been destroyed. Channels each have a reference to their
136 * target, and there is an additional single reference that corresponds
137 * to both DM and the misc device (both of which are destroyed by DM).
138 *
139 * In the common case userspace will be asleep waiting for a new
140 * message when device mapper decides to destroy the target, which
141 * means no new messages will appear. The destroyed flag triggers a
142 * wakeup, which will end up removing the reference.
143 */
144 struct kref references;
145 int dm_destroyed;
146 bool daemon_terminated;
147};
148
149struct channel {
150 struct target *target;
151
152 /*
153 * A channel has a single lock, which prevents multiple reads (or
154 * multiple writes) from conflicting with each other.
155 */
156 struct mutex lock;
157
158 struct message *cur_to_user;
159 struct message *cur_from_user;
160 ssize_t to_user_error;
161 ssize_t from_user_error;
162
163 /*
164 * Once a message has been forwarded to userspace on a channel it must
165 * be responded to on the same channel. This allows us to error out
166 * the messages that have not yet been responded to by a channel when
167 * that channel closes, which makes handling errors more reasonable for
168 * fault-tolerant userspace daemons. It also happens to make avoiding
169 * shared locks between user_map() and dev_read() a lot easier.
170 *
171 * This does preclude a multi-threaded work stealing userspace
172 * implementation (or at least, force a degree of head-of-line blocking
173 * on the response path).
174 */
175 struct list_head from_user;
176
177 /*
178 * Responses from userspace can arrive in arbitrarily small chunks.
179 * We need some place to buffer one up until we can find the
180 * corresponding kernel-side message to continue processing, so instead
181 * of allocating them we just keep one off to the side here. This can
182 * only ever be pointed to by cur_from_user, and will never have a BIO.
183 */
184 struct message scratch_message_from_user;
185};
186
187static void message_kill(struct message *m, mempool_t *pool)
188{
189 m->bio->bi_status = BLK_STS_IOERR;
190 bio_endio(m->bio);
191 bio_put(m->bio);
192 mempool_free(m, pool);
193}
194
195static inline bool is_user_space_thread_present(struct target *t)
196{
197 lockdep_assert_held(&t->lock);
198 return (kref_read(&t->references) > 1);
199}
200
201static void process_delayed_work(struct work_struct *work)
202{
203 struct delayed_work *del_work = to_delayed_work(work);
204 struct message *msg = container_of(del_work, struct message, work);
205
206 struct target *t = msg->t;
207
208 mutex_lock(&t->lock);
209
210 /*
211 * There is at least one thread to process the IO.
212 */
213 if (is_user_space_thread_present(t)) {
214 mutex_unlock(&t->lock);
215 return;
216 }
217
218 /*
219 * Terminate the IO with an error
220 */
221 list_del(&msg->to_user);
222 pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
223 msg->bio->bi_iter.bi_sector,
224 t->miscdev.name);
225 message_kill(msg, &t->message_pool);
226 mutex_unlock(&t->lock);
227}
228
229static void enqueue_delayed_work(struct message *m, bool is_delay)
230{
231 unsigned long delay = 0;
232
233 m->delayed = true;
234 INIT_DELAYED_WORK(&m->work, process_delayed_work);
235
236 /*
237 * The snapuserd daemon is the user-space process
238 * that services IO requests from dm-user while an
239 * OTA is being applied. Per the current design,
240 * when a dm-user target is created, the daemon
241 * attaches to the target and starts processing
242 * its IO. The daemon is terminated only when the
243 * dm-user target is destroyed.
244 *
245 * If, for some reason, the daemon crashes or terminates
246 * early without destroying the dm-user target, there
247 * is no mechanism to restart the daemon and resume
248 * processing IO from the same target. Theoretically
249 * it is possible, but that infrastructure does not
250 * exist in the Android ecosystem.
251 *
252 * Thus, once the daemon has terminated, the IO issued
253 * on that target can never be processed. Hence,
254 * we set the delay to 0 and fail the IO immediately.
255 *
256 * On the other hand, when a new dm-user target is created,
257 * we wait for the daemon to attach for the first time.
258 * This primarily happens when first-stage init spins up
259 * the daemon. At that point, since the snapshot device is
260 * mounted as the root filesystem, the dm-user target may
261 * receive IO requests before the daemon is fully launched.
262 * We don't want to fail those requests immediately, so we
263 * queue them with a timeout, giving the daemon time to get
264 * ready to process them. If the daemon fails to launch
265 * within the timeout period, the IO is then failed.
266 */
267 if (is_delay)
268 delay = msecs_to_jiffies(daemon_timeout_msec);
269
270 queue_delayed_work(system_wq, &m->work, delay);
271}
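
/*
 * A small worked example of the delay above (HZ is kernel configuration
 * dependent; 250 is only an assumption for illustration): with the default
 * daemon_timeout_msec of 4000 and HZ == 250, msecs_to_jiffies(4000) is
 * 1000 jiffies, so a message queued while no daemon has ever attached sits
 * on system_wq for roughly four seconds before process_delayed_work()
 * either finds a user-space thread or fails the IO. With is_delay == false
 * the work runs as soon as the workqueue gets to it.
 */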
272
273static inline struct target *target_from_target(struct dm_target *target)
274{
275 WARN_ON(target->private == NULL);
276 return target->private;
277}
278
279static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
280{
281 return container_of(miscdev, struct target, miscdev);
282}
283
284static inline struct channel *channel_from_file(struct file *file)
285{
286 WARN_ON(file->private_data == NULL);
287 return file->private_data;
288}
289
290static inline struct target *target_from_channel(struct channel *c)
291{
292 WARN_ON(c->target == NULL);
293 return c->target;
294}
295
296static inline size_t bio_size(struct bio *bio)
297{
298 struct bio_vec bvec;
299 struct bvec_iter iter;
300 size_t out = 0;
301
302 bio_for_each_segment (bvec, bio, iter)
303 out += bio_iter_len(bio, iter);
304 return out;
305}
306
307static inline size_t bio_bytes_needed_to_user(struct bio *bio)
308{
309 switch (bio_op(bio)) {
310 case REQ_OP_WRITE:
311 return sizeof(struct dm_user_message) + bio_size(bio);
312 case REQ_OP_READ:
313 case REQ_OP_FLUSH:
314 case REQ_OP_DISCARD:
315 case REQ_OP_SECURE_ERASE:
316 case REQ_OP_WRITE_SAME:
317 case REQ_OP_WRITE_ZEROES:
318 return sizeof(struct dm_user_message);
319
320 /*
321 * These ops are not passed to userspace under the assumption that
322 * they're not going to be particularly useful in that context.
323 */
324 default:
325 return -EOPNOTSUPP;
326 }
327}
328
329static inline size_t bio_bytes_needed_from_user(struct bio *bio)
330{
331 switch (bio_op(bio)) {
332 case REQ_OP_READ:
333 return sizeof(struct dm_user_message) + bio_size(bio);
334 case REQ_OP_WRITE:
335 case REQ_OP_FLUSH:
336 case REQ_OP_DISCARD:
337 case REQ_OP_SECURE_ERASE:
338 case REQ_OP_WRITE_SAME:
339 case REQ_OP_WRITE_ZEROES:
340 return sizeof(struct dm_user_message);
341
342 /*
343 * These ops are not passed to userspace under the assumption that
344 * they're not going to be particularly useful in that context.
345 */
346 default:
347 return -EOPNOTSUPP;
348 }
349}
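
/*
 * A worked example of the two sizing helpers above, with a 4096-byte
 * payload (sizeof(struct dm_user_message) is left symbolic since it is
 * defined by the uapi header):
 *
 *	REQ_OP_WRITE, 4 KiB:  to_user   = sizeof(msg) + 4096 (header + data)
 *	                      from_user = sizeof(msg)        (status only)
 *	REQ_OP_READ,  4 KiB:  to_user   = sizeof(msg)        (request only)
 *	                      from_user = sizeof(msg) + 4096 (status + data)
 *	REQ_OP_FLUSH:         sizeof(msg) in both directions.
 *
 * These totals are what posn_to_user and posn_from_user are compared
 * against in dev_read() and dev_write() to decide when a message has been
 * fully transferred.
 */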
350
351static inline long bio_type_to_user_type(struct bio *bio)
352{
353 switch (bio_op(bio)) {
354 case REQ_OP_READ:
355 return DM_USER_REQ_MAP_READ;
356 case REQ_OP_WRITE:
357 return DM_USER_REQ_MAP_WRITE;
358 case REQ_OP_FLUSH:
359 return DM_USER_REQ_MAP_FLUSH;
360 case REQ_OP_DISCARD:
361 return DM_USER_REQ_MAP_DISCARD;
362 case REQ_OP_SECURE_ERASE:
363 return DM_USER_REQ_MAP_SECURE_ERASE;
364 case REQ_OP_WRITE_SAME:
365 return DM_USER_REQ_MAP_WRITE_SAME;
366 case REQ_OP_WRITE_ZEROES:
367 return DM_USER_REQ_MAP_WRITE_ZEROES;
368
369 /*
370 * These ops are not passed to userspace under the assumption that
371 * they're not going to be particularly useful in that context.
372 */
373 default:
374 return -EOPNOTSUPP;
375 }
376}
377
378static inline long bio_flags_to_user_flags(struct bio *bio)
379{
380 u64 out = 0;
381 typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;
382
383 if (opf & REQ_FAILFAST_DEV) {
384 opf &= ~REQ_FAILFAST_DEV;
385 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
386 }
387
388 if (opf & REQ_FAILFAST_TRANSPORT) {
389 opf &= ~REQ_FAILFAST_TRANSPORT;
390 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
391 }
392
393 if (opf & REQ_FAILFAST_DRIVER) {
394 opf &= ~REQ_FAILFAST_DRIVER;
395 out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
396 }
397
398 if (opf & REQ_SYNC) {
399 opf &= ~REQ_SYNC;
400 out |= DM_USER_REQ_MAP_FLAG_SYNC;
401 }
402
403 if (opf & REQ_META) {
404 opf &= ~REQ_META;
405 out |= DM_USER_REQ_MAP_FLAG_META;
406 }
407
408 if (opf & REQ_PRIO) {
409 opf &= ~REQ_PRIO;
410 out |= DM_USER_REQ_MAP_FLAG_PRIO;
411 }
412
413 if (opf & REQ_NOMERGE) {
414 opf &= ~REQ_NOMERGE;
415 out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
416 }
417
418 if (opf & REQ_IDLE) {
419 opf &= ~REQ_IDLE;
420 out |= DM_USER_REQ_MAP_FLAG_IDLE;
421 }
422
423 if (opf & REQ_INTEGRITY) {
424 opf &= ~REQ_INTEGRITY;
425 out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
426 }
427
428 if (opf & REQ_FUA) {
429 opf &= ~REQ_FUA;
430 out |= DM_USER_REQ_MAP_FLAG_FUA;
431 }
432
433 if (opf & REQ_PREFLUSH) {
434 opf &= ~REQ_PREFLUSH;
435 out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
436 }
437
438 if (opf & REQ_RAHEAD) {
439 opf &= ~REQ_RAHEAD;
440 out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
441 }
442
443 if (opf & REQ_BACKGROUND) {
444 opf &= ~REQ_BACKGROUND;
445 out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
446 }
447
448 if (opf & REQ_NOWAIT) {
449 opf &= ~REQ_NOWAIT;
450 out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
451 }
452
453 if (opf & REQ_NOUNMAP) {
454 opf &= ~REQ_NOUNMAP;
455 out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
456 }
457
458 if (unlikely(opf)) {
459 pr_warn("unsupported BIO type %x\n", opf);
460 return -EOPNOTSUPP;
461 }
462 WARN_ON(out < 0);
463 return out;
464}
465
466/*
467 * Not quite what's in blk-map.c, but instead what I thought the functions in
468 * blk-map did. This one seems more generally useful and I think we could
469 * write the blk-map version in terms of this one. The differences are that
470 * this has a return value that counts, and blk-map uses the BIO _all iters.
471 * Neither advances the BIO iter, though both advance the IOV iter, which is
472 * a bit odd here.
473 */
474static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
475{
476 struct bio_vec bvec;
477 struct bvec_iter biter;
478 ssize_t out = 0;
479
480 bio_for_each_segment (bvec, bio, biter) {
481 ssize_t ret;
482
483 ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
484 bvec.bv_len, iter);
485
486 /*
487 * FIXME: I thought that IOV copies had a mechanism for
488 * terminating early, if for example a signal came in while
489 * sleeping waiting for a page to be mapped, but I don't see
490 * where that would happen.
491 */
492 WARN_ON(ret < 0);
493 out += ret;
494
495 if (!iov_iter_count(iter))
496 break;
497
498 if (ret < bvec.bv_len)
499 return ret;
500 }
501
502 return out;
503}
504
505static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
506{
507 struct bio_vec bvec;
508 struct bvec_iter biter;
509 ssize_t out = 0;
510
511 bio_for_each_segment (bvec, bio, biter) {
512 ssize_t ret;
513
514 ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
515 bvec.bv_len, iter);
516
517 /* as above */
518 WARN_ON(ret < 0);
519 out += ret;
520
521 if (!iov_iter_count(iter))
522 break;
523
524 if (ret < bvec.bv_len)
525 return ret;
526 }
527
528 return out;
529}
530
531static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
532{
533 ssize_t copied = 0;
534
535 if (!iov_iter_count(to))
536 return 0;
537
538 if (msg->posn_to_user < sizeof(msg->msg)) {
539 copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
540 sizeof(msg->msg) - msg->posn_to_user, to);
541 } else {
542 copied = bio_copy_to_iter(msg->bio, to);
543 if (copied > 0)
544 bio_advance(msg->bio, copied);
545 }
546
547 if (copied < 0)
548 return copied;
549
550 msg->posn_to_user += copied;
551 return copied;
552}
553
554static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
555{
556 ssize_t copied = 0;
557
558 if (!iov_iter_count(from))
559 return 0;
560
561 if (msg->posn_from_user < sizeof(msg->msg)) {
562 copied = copy_from_iter(
563 (char *)(&msg->msg) + msg->posn_from_user,
564 sizeof(msg->msg) - msg->posn_from_user, from);
565 } else {
566 copied = bio_copy_from_iter(msg->bio, from);
567 if (copied > 0)
568 bio_advance(msg->bio, copied);
569 }
570
571 if (copied < 0)
572 return copied;
573
574 msg->posn_from_user += copied;
575 return copied;
576}
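
/*
 * An illustration of how the position counters above tolerate short
 * userspace IO (the split is made up for the example): if userspace answers
 * a 4 KiB read with several small write() calls, the first calls only
 * accumulate header bytes, with posn_from_user growing toward
 * sizeof(struct dm_user_message). Once the header is complete, dev_write()
 * looks the message up by seq and bio_copy_from_iter() starts filling the
 * BIO pages with whatever arrives next. The BIO is only completed once
 * posn_from_user reaches total_from_user, so arbitrarily small chunks are
 * fine, as the channel comment above promises.
 */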
577
578static struct message *msg_get_map(struct target *t)
579{
580 struct message *m;
581
582 lockdep_assert_held(&t->lock);
583
584 m = mempool_alloc(&t->message_pool, GFP_NOIO);
585 m->msg.seq = t->next_seq_to_map++;
586 INIT_LIST_HEAD(&m->to_user);
587 INIT_LIST_HEAD(&m->from_user);
588 return m;
589}
590
591static struct message *msg_get_to_user(struct target *t)
592{
593 struct message *m;
594
595 lockdep_assert_held(&t->lock);
596
597 if (list_empty(&t->to_user))
598 return NULL;
599
600 m = list_first_entry(&t->to_user, struct message, to_user);
601
602 list_del(&m->to_user);
603
604 /*
605 * If the IO was queued to the workqueue because
606 * there was no daemon to service it, we have to
607 * cancel the delayed work, as the IO will now be
608 * processed by this user-space thread.
609 *
610 * If the delayed work was already picked up for
611 * processing, then wait for it to complete. Note
612 * that the IO will not be terminated by the work
613 * queue thread.
614 */
615 if (unlikely(m->delayed)) {
616 mutex_unlock(&t->lock);
617 cancel_delayed_work_sync(&m->work);
618 mutex_lock(&t->lock);
619 }
620 return m;
621}
622
623static struct message *msg_get_from_user(struct channel *c, u64 seq)
624{
625 struct message *m;
626 struct list_head *cur, *tmp;
627
628 lockdep_assert_held(&c->lock);
629
630 list_for_each_safe (cur, tmp, &c->from_user) {
631 m = list_entry(cur, struct message, from_user);
632 if (m->msg.seq == seq) {
633 list_del(&m->from_user);
634 return m;
635 }
636 }
637
638 return NULL;
639}
640
641/*
642 * Returns 0 when there is no work left to do. This must be callable without
643 * holding the target lock, as it is part of the waitqueue's check expression.
644 * When called without the lock it may spuriously indicate there is remaining
645 * work, but when called with the lock it must be accurate.
646 */
647static int target_poll(struct target *t)
648{
649 return !list_empty(&t->to_user) || t->dm_destroyed;
650}
651
652static void target_release(struct kref *ref)
653{
654 struct target *t = container_of(ref, struct target, references);
655 struct list_head *cur, *tmp;
656
657 /*
658 * There may be outstanding BIOs that have not yet been given to
659 * userspace. At this point there's nothing we can do about them, as
660 * there are and will never be any channels.
661 */
662 list_for_each_safe (cur, tmp, &t->to_user) {
663 struct message *m = list_entry(cur, struct message, to_user);
664
665 if (unlikely(m->delayed)) {
666 bool ret;
667
668 mutex_unlock(&t->lock);
669 ret = cancel_delayed_work_sync(&m->work);
670 mutex_lock(&t->lock);
671 if (!ret)
672 continue;
673 }
674 message_kill(m, &t->message_pool);
675 }
676
677 mempool_exit(&t->message_pool);
678 mutex_unlock(&t->lock);
679 mutex_destroy(&t->lock);
680 kfree(t);
681}
682
683static void target_put(struct target *t)
684{
685 /*
686 * This both releases a reference to the target and the lock. We leave
687 * it up to the caller to hold the lock, as they probably needed it for
688 * something else.
689 */
690 lockdep_assert_held(&t->lock);
691
692 if (!kref_put(&t->references, target_release)) {
693 /*
694 * The user-space thread is being terminated.
695 * We need to scan the list for all pending
696 * IOs that have not been processed yet and
697 * put them back on the workqueue for delayed
698 * processing.
699 */
700 if (!is_user_space_thread_present(t)) {
701 struct list_head *cur, *tmp;
702
703 list_for_each_safe(cur, tmp, &t->to_user) {
704 struct message *m = list_entry(cur,
705 struct message,
706 to_user);
707 if (!m->delayed)
708 enqueue_delayed_work(m, false);
709 }
710 /*
711 * The daemon attached to this target has terminated.
712 */
713 t->daemon_terminated = true;
714 }
715 mutex_unlock(&t->lock);
716 }
717}
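
/*
 * The reference counting above, traced through a typical lifetime (this is
 * a summary of the existing code, not additional behaviour): user_ctr()
 * starts the target at one reference (DM plus the miscdev); every
 * dev_open() adds one via channel_alloc(); every dev_release() drops one
 * via channel_free() -> target_put(); and user_dtr() drops the initial
 * reference. is_user_space_thread_present() therefore reads
 * "references > 1" as "at least one channel is still open", and
 * target_release() only runs once DM and every channel are gone.
 */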
718
719static struct channel *channel_alloc(struct target *t)
720{
721 struct channel *c;
722
723 lockdep_assert_held(&t->lock);
724
725 c = kzalloc(sizeof(*c), GFP_KERNEL);
726 if (c == NULL)
727 return NULL;
728
729 kref_get(&t->references);
730 c->target = t;
731 c->cur_from_user = &c->scratch_message_from_user;
732 mutex_init(&c->lock);
733 INIT_LIST_HEAD(&c->from_user);
734 return c;
735}
736
737static void channel_free(struct channel *c)
738{
739 struct list_head *cur, *tmp;
740
741 lockdep_assert_held(&c->lock);
742
743 /*
744 * There may be outstanding BIOs that have been given to userspace but
745 * have not yet been completed. The channel has been shut down so
746 * there's no way to process the rest of those messages, so we just go
747 * ahead and error out the BIOs. Hopefully whatever's on the other end
748 * can handle the errors. One could imagine splitting the BIOs and
749 * completing as much as we got, but that seems like overkill here.
750 *
751 * Our only other options would be to let the BIO hang around (which
752 * seems way worse) or to resubmit it to userspace in the hope there's
753 * another channel. I don't really like the idea of submitting a
754 * message twice.
755 */
756 if (c->cur_to_user != NULL)
757 message_kill(c->cur_to_user, &c->target->message_pool);
758 if (c->cur_from_user != &c->scratch_message_from_user)
759 message_kill(c->cur_from_user, &c->target->message_pool);
760 list_for_each_safe (cur, tmp, &c->from_user)
761 message_kill(list_entry(cur, struct message, from_user),
762 &c->target->message_pool);
763
764 mutex_lock(&c->target->lock);
765 target_put(c->target);
766 mutex_unlock(&c->lock);
767 mutex_destroy(&c->lock);
768 kfree(c);
769}
770
771static int dev_open(struct inode *inode, struct file *file)
772{
773 struct channel *c;
774 struct target *t;
775
776 /*
777 * This is called by miscdev, which sets private_data to point to the
778 * struct miscdevice that was opened. The rest of our file operations
779 * want to refer to the channel that's been opened, so we swap that
780 * pointer out with a fresh channel.
781 *
782 * This is called with the miscdev lock held, which is also held while
783 * registering/unregistering the miscdev. The miscdev must be
784 * registered for this to get called, which means there must be an
785 * outstanding reference to the target, which means it cannot be freed
786 * out from under us despite us not holding a reference yet.
787 */
788 t = container_of(file->private_data, struct target, miscdev);
789 mutex_lock(&t->lock);
790 file->private_data = c = channel_alloc(t);
791
792 if (c == NULL) {
793 mutex_unlock(&t->lock);
794 return -ENOMEM;
795 }
796
797 mutex_unlock(&t->lock);
798 return 0;
799}
800
801static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
802{
803 struct channel *c = channel_from_file(iocb->ki_filp);
804 ssize_t total_processed = 0;
805 ssize_t processed;
806
807 mutex_lock(&c->lock);
808
809 if (unlikely(c->to_user_error)) {
810 total_processed = c->to_user_error;
811 goto cleanup_unlock;
812 }
813
814 if (c->cur_to_user == NULL) {
815 struct target *t = target_from_channel(c);
816
817 mutex_lock(&t->lock);
818
819 while (!target_poll(t)) {
820 int e;
821
822 mutex_unlock(&t->lock);
823 mutex_unlock(&c->lock);
824 e = wait_event_interruptible(t->wq, target_poll(t));
825 mutex_lock(&c->lock);
826 mutex_lock(&t->lock);
827
828 if (unlikely(e != 0)) {
829 /*
830 * We haven't processed any bytes in either the
831 * BIO or the IOV, so we can just terminate
832 * right now. Code elsewhere in the kernel handles
833 * restarting the syscall when appropriate.
834 */
835 total_processed = e;
836 mutex_unlock(&t->lock);
837 goto cleanup_unlock;
838 }
839 }
840
841 if (unlikely(t->dm_destroyed)) {
842 /*
843 * DM has destroyed this target, so just lock
844 * the user out. There's really nothing else
845 * we can do here. Note that we don't actually
846 * tear any thing down until userspace has
847 * closed the FD, as there may still be
848 * outstanding BIOs.
849 *
850 * This is kind of a wacky error code to
851 * return. My goal was really just to try and
852 * find something that wasn't likely to be
853 * returned by anything else in the miscdev
854 * path. The message "block device required"
855 * seems like a somewhat reasonable thing to
856 * say when the target has disappeared out from
857 * under us, but "not block" isn't sensible.
858 */
859 c->to_user_error = total_processed = -ENOTBLK;
860 mutex_unlock(&t->lock);
861 goto cleanup_unlock;
862 }
863
864 /*
865 * Ensures that accesses to the message data are not ordered
866 * before the remote accesses that produce that message data.
867 *
868 * This pairs with the barrier in user_map(), via the
869 * conditional within the while loop above. Also see the lack
870 * of barrier in user_dtr(), which is why this can be after the
871 * destroyed check.
872 */
873 smp_rmb();
874
875 c->cur_to_user = msg_get_to_user(t);
876 WARN_ON(c->cur_to_user == NULL);
877 mutex_unlock(&t->lock);
878 }
879
880 processed = msg_copy_to_iov(c->cur_to_user, to);
881 total_processed += processed;
882
883 WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
884 if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
885 struct message *m = c->cur_to_user;
886
887 c->cur_to_user = NULL;
888 list_add_tail(&m->from_user, &c->from_user);
889 }
890
891cleanup_unlock:
892 mutex_unlock(&c->lock);
893 return total_processed;
894}
895
896static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
897{
898 struct channel *c = channel_from_file(iocb->ki_filp);
899 ssize_t total_processed = 0;
900 ssize_t processed;
901
902 mutex_lock(&c->lock);
903
904 if (unlikely(c->from_user_error)) {
905 total_processed = c->from_user_error;
906 goto cleanup_unlock;
907 }
908
909 /*
910 * cur_from_user can never be NULL. If there's no real message it must
911 * point to the scratch space.
912 */
913 WARN_ON(c->cur_from_user == NULL);
914 if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
915 struct message *msg, *old;
916
917 processed = msg_copy_from_iov(c->cur_from_user, from);
918 if (processed <= 0) {
919 pr_warn("msg_copy_from_iov() returned %zd\n",
920 processed);
921 c->from_user_error = -EINVAL;
922 goto cleanup_unlock;
923 }
924 total_processed += processed;
925
926 /*
927 * In the unlikely event the user has provided us a very short
928 * write, not even big enough to fill a message, just succeed.
929 * We'll eventually build up enough bytes to do something.
930 */
931 if (unlikely(c->cur_from_user->posn_from_user <
932 sizeof(struct dm_user_message)))
933 goto cleanup_unlock;
934
935 old = c->cur_from_user;
936 mutex_lock(&c->target->lock);
937 msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
938 if (msg == NULL) {
939 pr_info("user provided an invalid messag seq of %llx\n",
940 old->msg.seq);
941 mutex_unlock(&c->target->lock);
942 c->from_user_error = -EINVAL;
943 goto cleanup_unlock;
944 }
945 mutex_unlock(&c->target->lock);
946
947 WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
948 msg->posn_from_user = sizeof(struct dm_user_message);
949 msg->return_type = old->msg.type;
950 msg->return_flags = old->msg.flags;
951 WARN_ON(msg->posn_from_user > msg->total_from_user);
952 c->cur_from_user = msg;
953 WARN_ON(old != &c->scratch_message_from_user);
954 }
955
956 /*
957 * Userspace can signal an error for single requests by overwriting the
958 * type field.
959 */
960 switch (c->cur_from_user->return_type) {
961 case DM_USER_RESP_SUCCESS:
962 c->cur_from_user->bio->bi_status = BLK_STS_OK;
963 break;
964 case DM_USER_RESP_ERROR:
965 case DM_USER_RESP_UNSUPPORTED:
966 default:
967 c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
968 goto finish_bio;
969 }
970
971 /*
972 * The op was a success as far as userspace is concerned, so process
973 * whatever data may come along with it. The user may provide the BIO
974 * data in multiple chunks, in which case we don't need to finish the
975 * BIO.
976 */
977 processed = msg_copy_from_iov(c->cur_from_user, from);
978 total_processed += processed;
979
980 if (c->cur_from_user->posn_from_user <
981 c->cur_from_user->total_from_user)
982 goto cleanup_unlock;
983
984finish_bio:
985 /*
986 * When we set up this message the BIO's size matched the
987 * message size, if that's not still the case then something
988 * has gone off the rails.
989 */
990 WARN_ON(bio_size(c->cur_from_user->bio) != 0);
991 bio_endio(c->cur_from_user->bio);
992 bio_put(c->cur_from_user->bio);
993
994 /*
995 * We don't actually need to take the target lock here, as all
996 * we're doing is freeing the message and mempools have their
997 * own lock. Each channel has its own scratch message.
998 */
999 WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
1000 mempool_free(c->cur_from_user, &c->target->message_pool);
1001 c->scratch_message_from_user.posn_from_user = 0;
1002 c->cur_from_user = &c->scratch_message_from_user;
1003
1004cleanup_unlock:
1005 mutex_unlock(&c->lock);
1006 return total_processed;
1007}
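
/*
 * For illustration, the userspace side of the error path handled above.
 * This is a sketch, not a reference implementation; the fd and the saved
 * request_header are assumptions. To fail a single request the daemon
 * echoes back just the header it read, keeping seq but changing type:
 *
 *	struct dm_user_message resp = request_header;	// same resp.seq
 *	resp.type = DM_USER_RESP_ERROR;			// or _UNSUPPORTED
 *	write(fd, &resp, sizeof(resp));
 *
 * dev_write() then takes the DM_USER_RESP_ERROR branch, sets BLK_STS_IOERR
 * and completes the BIO immediately; no data payload is copied.
 */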
1008
1009static int dev_release(struct inode *inode, struct file *file)
1010{
1011 struct channel *c;
1012
1013 c = channel_from_file(file);
1014 mutex_lock(&c->lock);
1015 channel_free(c);
1016
1017 return 0;
1018}
1019
1020static const struct file_operations file_operations = {
1021 .owner = THIS_MODULE,
1022 .open = dev_open,
1023 .llseek = no_llseek,
1024 .read_iter = dev_read,
1025 .write_iter = dev_write,
1026 .release = dev_release,
1027};
1028
1029static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1030{
1031 struct target *t;
1032 int r;
1033
1034 if (argc != 3) {
1035 ti->error = "Invalid argument count";
1036 r = -EINVAL;
1037 goto cleanup_none;
1038 }
1039
1040 t = kzalloc(sizeof(*t), GFP_KERNEL);
1041 if (t == NULL) {
1042 r = -ENOMEM;
1043 goto cleanup_none;
1044 }
1045 ti->private = t;
1046
1047 /* Enable more BIO types. */
1048 ti->num_discard_bios = 1;
1049 ti->discards_supported = true;
1050 ti->num_flush_bios = 1;
1051 ti->flush_supported = true;
1052
1053 /*
1054 * We begin with a single reference to the target, which is miscdev's
1055 * reference. This ensures that the target won't be freed
1056 * until after the miscdev has been unregistered and all extant
1057 * channels have been closed.
1058 */
1059 kref_init(&t->references);
1060
1061 t->daemon_terminated = false;
1062 mutex_init(&t->lock);
1063 init_waitqueue_head(&t->wq);
1064 INIT_LIST_HEAD(&t->to_user);
1065 mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
1066 sizeof(struct message));
1067
1068 t->miscdev.minor = MISC_DYNAMIC_MINOR;
1069 t->miscdev.fops = &file_operations;
1070 t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
1071 if (t->miscdev.name == NULL) {
1072 r = -ENOMEM;
1073 goto cleanup_message_pool;
1074 }
1075
1076 /*
1077 * Once the miscdev is registered it can be opened and therefore
1078 * concurrent references to the channel can happen. Holding the target
1079 * lock during misc_register() could deadlock. If registration
1080 * succeeds then we will not access the target again so we just stick a
1081 * barrier here, which pairs with taking the target lock everywhere
1082 * else the target is accessed.
1083 *
1084 * I forgot where we ended up on the RCpc/RCsc locks. IIUC RCsc locks
1085 * would mean that we could take the target lock earlier and release it
1086 * here instead of the memory barrier. I'm not sure that's any better,
1087 * though, and this isn't on a hot path so it probably doesn't matter
1088 * either way.
1089 */
1090 smp_mb();
1091
1092 r = misc_register(&t->miscdev);
1093 if (r) {
1094 DMERR("Unable to register miscdev %s for dm-user",
1095 t->miscdev.name);
1096 r = -ENOMEM;
1097 goto cleanup_misc_name;
1098 }
1099
1100 return 0;
1101
1102cleanup_misc_name:
1103 kfree(t->miscdev.name);
1104cleanup_message_pool:
1105 mempool_exit(&t->message_pool);
1106 kfree(t);
1107cleanup_none:
1108 return r;
1109}
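
/*
 * For context, and hedged because the table syntax is interpreted by the
 * device-mapper core and by tooling rather than by this file: the
 * constructor above requires exactly three arguments but only uses the
 * third, argv[2], which becomes the node name under /dev/dm-user/. A
 * target created with the name "foo" is therefore serviced by opening
 * /dev/dm-user/foo, which is where dev_open() comes in.
 */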
1110
1111static void user_dtr(struct dm_target *ti)
1112{
1113 struct target *t = target_from_target(ti);
1114
1115 /*
1116 * Removes the miscdev. This must be called without the target lock
1117 * held to avoid a possible deadlock because our open implementation is
1118 * called holding the miscdev lock and must later take the target lock.
1119 *
1120 * There is no race here because only DM can register/unregister the
1121 * miscdev, and DM ensures that doesn't happen twice. The internal
1122 * miscdev lock is sufficient to ensure there are no races between
1123 * deregistering the miscdev and open.
1124 */
1125 misc_deregister(&t->miscdev);
1126
1127 /*
1128 * We are now free to take the target's lock and drop our reference to
1129 * the target. There are almost certainly tasks sleeping in read on at
1130 * least one of the channels associated with this target, this
1131 * explicitly wakes them up and terminates the read.
1132 */
1133 mutex_lock(&t->lock);
1134 /*
1135 * No barrier here, as wait/wake ensures that the flag visibility is
1136 * correct WRT the wake/sleep state of the target tasks.
1137 */
1138 t->dm_destroyed = true;
1139 wake_up_all(&t->wq);
1140 target_put(t);
1141}
1142
1143/*
1144 * Consumes a BIO from device mapper, queueing it up for userspace.
1145 */
1146static int user_map(struct dm_target *ti, struct bio *bio)
1147{
1148 struct target *t;
1149 struct message *entry;
1150
1151 t = target_from_target(ti);
1152 /*
1153 * FIXME
1154 *
1155 * This seems like a bad idea. Specifically, here we're
1156 * directly on the IO path when we take the target lock, which may also
1157 * be taken from a user context. The user context doesn't actively
1158 * trigger anything that may sleep while holding the lock, but this
1159 * still seems like a bad idea.
1160 *
1161 * The obvious way to fix this would be to use a proper queue, which
1162 * would result in no shared locks between the direct IO path and user
1163 * tasks. I had a version that did this, but the head-of-line blocking
1164 * from the circular buffer resulted in us needing a fairly large
1165 * allocation in order to avoid situations in which the queue fills up
1166 * and everything goes off the rails.
1167 *
1168 * I could jump through some hoops to avoid a shared lock while still
1169 * allowing for a large queue, but I'm not actually sure that allowing
1170 * for very large queues is the right thing to do here. Intuitively it
1171 * seems better to keep the queues small in here (essentially sized to
1172 * the user latency for performance reasons only) and rely on returning
1173 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
1174 * kernel more information.
1175 *
1176 * I'll spend some time trying to figure out what's going on with
1177 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
1178 * this I'm all ears.
1179 */
1180 mutex_lock(&t->lock);
1181
1182 /*
1183 * FIXME
1184 *
1185 * The assumption here is that there's no benefit to returning
1186 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
1187 * sure that's actually true -- for example, I could imagine users
1188 * expecting that submitted BIOs are unlikely to fail and therefore
1189 * relying on submission failure to indicate an unsupported type.
1190 *
1191 * There's two ways I can think of to fix this:
1192 * - Add DM arguments that are parsed during the constructor that
1193 * allow various dm_target flags to be set that indicate the op
1194 * types supported by this target. This may make sense for things
1195 * like discard, where DM can already transform the BIOs to a form
1196 * that's likely to be supported.
1197 * - Some sort of pre-filter that allows userspace to hook in here
1198 * and kill BIOs before marking them as submitted. My guess would
1199 * be that a userspace round trip is a bad idea here, but a BPF
1200 * call seems reasonable.
1201 *
1202 * My guess is that we'd likely want to do both. The first one is easy
1203 * and gives DM the proper info, so it seems better. The BPF call
1204 * seems overly complex for just this, but one could imagine wanting to
1205 * sometimes return _MAPPED and a BPF filter would be the way to do
1206 * that.
1207 *
1208 * For example, in Android we have an in-kernel DM device called
1209 * "dm-bow" that takes advange of some portion of the space that has
1210 * been discarded on a device to provide opportunistic block-level
1211 * backups. While one could imagine just implementing this entirely in
1212 * userspace, that would come with an appreciable performance penalty.
1213 * Instead one could keep a BPF program that forwards most accesses
1214 * directly to the backing block device while informing a userspace
1215 * daemon of any discarded space and on writes to blocks that are to be
1216 * backed up.
1217 */
1218 if (unlikely((bio_type_to_user_type(bio) < 0) ||
1219 (bio_flags_to_user_flags(bio) < 0))) {
1220 mutex_unlock(&t->lock);
1221 return DM_MAPIO_KILL;
1222 }
1223
1224 entry = msg_get_map(t);
1225 if (unlikely(entry == NULL)) {
1226 mutex_unlock(&t->lock);
1227 return DM_MAPIO_REQUEUE;
1228 }
1229
1230 bio_get(bio);
1231 entry->msg.type = bio_type_to_user_type(bio);
1232 entry->msg.flags = bio_flags_to_user_flags(bio);
1233 entry->msg.sector = bio->bi_iter.bi_sector;
1234 entry->msg.len = bio_size(bio);
1235 entry->bio = bio;
1236 entry->posn_to_user = 0;
1237 entry->total_to_user = bio_bytes_needed_to_user(bio);
1238 entry->posn_from_user = 0;
1239 entry->total_from_user = bio_bytes_needed_from_user(bio);
1240 entry->delayed = false;
1241 entry->t = t;
1242 /* Pairs with the barrier in dev_read() */
1243 smp_wmb();
1244 list_add_tail(&entry->to_user, &t->to_user);
1245
1246 /*
1247 * If there is no daemon to process the IOs,
1248 * queue these messages on a workqueue with
1249 * a timeout.
1250 */
1251 if (!is_user_space_thread_present(t))
1252 enqueue_delayed_work(entry, !t->daemon_terminated);
1253
1254 wake_up_interruptible(&t->wq);
1255 mutex_unlock(&t->lock);
1256 return DM_MAPIO_SUBMITTED;
1257}
1258
1259static struct target_type user_target = {
1260 .name = "user",
1261 .version = { 1, 0, 0 },
1262 .module = THIS_MODULE,
1263 .ctr = user_ctr,
1264 .dtr = user_dtr,
1265 .map = user_map,
1266};
1267
1268static int __init dm_user_init(void)
1269{
1270 int r;
1271
1272 r = dm_register_target(&user_target);
1273 if (r) {
1274 DMERR("register failed %d", r);
1275 goto error;
1276 }
1277
1278 return 0;
1279
1280error:
1281 return r;
1282}
1283
1284static void __exit dm_user_exit(void)
1285{
1286 dm_unregister_target(&user_target);
1287}
1288
1289module_init(dm_user_init);
1290module_exit(dm_user_exit);
1291MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
1292MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
1293MODULE_LICENSE("GPL");