// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "user"

#define MAX_OUTSTANDING_MESSAGES 128

static unsigned int daemon_timeout_msec = 4000;
module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
		   0644);
MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
		 "IO timeout in msec if the daemon does not process requests");

/*
 * dm-user uses four structures:
 *
 * - "struct target", the outermost structure, corresponds to a single device
 *   mapper target. This contains the set of outstanding BIOs that have been
 *   provided by DM and are not actively being processed by the user, along
 *   with a misc device that userspace can open to communicate with the
 *   kernel. Each time userspace opens the misc device a new channel is
 *   created.
 * - "struct channel", which represents a single active communication channel
 *   with userspace. Userspace may choose arbitrary read/write sizes to use
 *   when processing messages; channels form these into logical accesses.
 *   When userspace responds to a full message the channel completes the BIO
 *   and obtains a new message to process from the target.
 * - "struct message", which wraps a BIO with the additional information
 *   required by the kernel to sort out what to do with BIOs when they return
 *   from userspace.
 * - "struct dm_user_message", which is the exact message format that
 *   userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 * - user_map(), which is provided a BIO from device mapper that is queued
 *   into the target. This allocates and enqueues a new message.
 * - dev_read(), which dequeues a message and copies it to userspace.
 * - dev_write(), which looks up a message (keyed by sequence number) and
 *   completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock. This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */

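/*
 * For orientation, the userspace side of the protocol boils down to a loop
 * along the lines sketched below. The actual daemon is implementation
 * specific (snapuserd on Android) and is not part of this driver, so treat
 * this purely as an illustrative sketch:
 *
 *	int fd = open("/dev/dm-user/<name>", O_RDWR);
 *	for (;;) {
 *		struct dm_user_message msg;
 *		read(fd, &msg, sizeof(msg));      // plus payload for writes
 *		// ... service msg.type at msg.sector for msg.len bytes ...
 *		msg.type = DM_USER_RESP_SUCCESS;  // or DM_USER_RESP_ERROR
 *		write(fd, &msg, sizeof(msg));     // plus payload for reads
 *	}
 */
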
struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;
	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user. They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers. These are only valid after a write
	 * has adopted the message.
	 */
	u64 return_type;
	u64 return_flags;

	struct delayed_work work;
	bool delayed;
	struct target *t;
};

struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target). The
	 * corresponding write side doesn't block; instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order. This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target. The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/. It looks like a better way to do this may be to have
	 * a filesystem to manage these, but this was more expedient. The
	 * current mechanism is functional, but does result in an arbitrary
	 * number of dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed. Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear. The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
	bool daemon_terminated;
};

struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel. This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons. It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;

	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here. This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};

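/*
 * Error out a message's BIO and hand the message back to the target's
 * mempool.  Used whenever a message can no longer be serviced.
 */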
static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	bio_put(m->bio);
	mempool_free(m, pool);
}

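/*
 * True if at least one userspace channel currently holds a reference to the
 * target; the base reference belongs to DM and the misc device.
 */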
static inline bool is_user_space_thread_present(struct target *t)
{
	lockdep_assert_held(&t->lock);
	return (kref_read(&t->references) > 1);
}

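/*
 * Delayed-work handler: if no daemon has attached to the target by the time
 * the timeout fires, fail the queued IO with an error.
 */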
static void process_delayed_work(struct work_struct *work)
{
	struct delayed_work *del_work = to_delayed_work(work);
	struct message *msg = container_of(del_work, struct message, work);

	struct target *t = msg->t;

	mutex_lock(&t->lock);

	/*
	 * There is at least one thread to process the IO.
	 */
	if (is_user_space_thread_present(t)) {
		mutex_unlock(&t->lock);
		return;
	}

	/*
	 * Terminate the IO with an error
	 */
	list_del(&msg->to_user);
	pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
	       msg->bio->bi_iter.bi_sector,
	       t->miscdev.name);
	message_kill(msg, &t->message_pool);
	mutex_unlock(&t->lock);
}

static void enqueue_delayed_work(struct message *m, bool is_delay)
{
	unsigned long delay = 0;

	m->delayed = true;
	INIT_DELAYED_WORK(&m->work, process_delayed_work);

	/*
	 * The snapuserd daemon is the user-space process which services IO
	 * requests from dm-user when an OTA is applied. Per the current
	 * design, when a dm-user target is created, the daemon attaches to
	 * the target and starts processing the IOs. The daemon is terminated
	 * only when the dm-user target is destroyed.
	 *
	 * If for some reason the daemon crashes or terminates early without
	 * destroying the dm-user target, then there is no mechanism to
	 * restart the daemon and resume processing IOs from the same target.
	 * Theoretically it is possible, but that infrastructure doesn't exist
	 * in the Android ecosystem.
	 *
	 * Thus, when the daemon terminates, there is no way the IOs issued on
	 * that target will be processed. Hence, we set the delay to 0 and
	 * fail the IOs immediately.
	 *
	 * On the other hand, when a new dm-user target is created, we wait
	 * for the daemon to attach for the first time. This primarily happens
	 * when first-stage init spins up the daemon. At this point, since the
	 * snapshot device is mounted as part of the root filesystem, the
	 * dm-user target may receive IO requests even though the daemon is
	 * not fully launched. We don't want to fail those IO requests
	 * immediately, so we queue them with a timeout to give the daemon
	 * time to become ready to process them. If the daemon fails to launch
	 * within the timeout period, the IOs will be failed.
	 */
	if (is_delay)
		delay = msecs_to_jiffies(daemon_timeout_msec);

	queue_delayed_work(system_wq, &m->work, delay);
}

static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}

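/*
 * Number of data bytes carried by the BIO, summed segment by segment.
 */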
static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}

static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_SAME:
		return DM_USER_REQ_MAP_WRITE_SAME;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

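/*
 * Translate the BIO's request flags into the DM_USER_REQ_MAP_FLAG_* bits
 * exposed to userspace.  Any flag we don't know how to forward causes the
 * BIO to be rejected with -EOPNOTSUPP.
 */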
static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO flags %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}

/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did. This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one. The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * Neither advances the BIO iter, but both advance the IOV iter, which is a
 * bit odd here.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* as above */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

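/*
 * Copy the next chunk of the outgoing message to the iov: the
 * dm_user_message header first, then (for writes) the BIO payload.
 * Advances posn_to_user by the number of bytes copied.
 */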
static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user, to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

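/*
 * Counterpart of msg_copy_to_iov() for the response path: fill in the
 * dm_user_message header first, then (for reads) the BIO payload, advancing
 * posn_from_user as data arrives.
 */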
static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}

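/*
 * Allocate a message for a freshly mapped BIO and stamp it with the next
 * mapping sequence number.  Called with the target lock held.
 */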
static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

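/*
 * Dequeue the oldest message waiting to be delivered to userspace, or NULL
 * if there is none.  Any pending timeout work for the message is cancelled,
 * since it is now being serviced by a daemon thread.
 */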
static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);

	list_del(&m->to_user);

	/*
	 * If the IO was queued to the workqueue because there was no daemon
	 * to service it, we have to cancel the delayed work since this
	 * user-space thread will now process the IO.
	 *
	 * If the delayed work was already picked up for processing, then
	 * wait for it to complete. Note that the IO will not be terminated
	 * by the work queue thread.
	 */
	if (unlikely(m->delayed)) {
		mutex_unlock(&t->lock);
		cancel_delayed_work_sync(&m->work);
		mutex_lock(&t->lock);
	}
	return m;
}

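/*
 * Find the outstanding message on this channel whose sequence number matches
 * the one userspace wrote back, removing it from the channel's list.
 */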
static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}

/*
 * Returns 0 when there is no work left to do. This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
static int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

static void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace. At this point there's nothing we can do about them, as
	 * there are no channels left and never will be any.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		struct message *m = list_entry(cur, struct message, to_user);

		if (unlikely(m->delayed)) {
			bool ret;

			mutex_unlock(&t->lock);
			ret = cancel_delayed_work_sync(&m->work);
			mutex_lock(&t->lock);
			if (!ret)
				continue;
		}
		message_kill(m, &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

static void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock. We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release)) {
		/*
		 * The user-space thread is being terminated. Scan the list
		 * for all the pending IOs that have not been processed yet
		 * and put them back on the workqueue for delayed processing.
		 */
		if (!is_user_space_thread_present(t)) {
			struct list_head *cur, *tmp;

			list_for_each_safe(cur, tmp, &t->to_user) {
				struct message *m = list_entry(cur,
							       struct message,
							       to_user);
				if (!m->delayed)
					enqueue_delayed_work(m, false);
			}
			/*
			 * The daemon attached to this target has terminated.
			 */
			t->daemon_terminated = true;
		}
		mutex_unlock(&t->lock);
	}
}

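/*
 * Allocate a channel for a newly opened file, taking a reference on the
 * target it belongs to.  Called with the target lock held.
 */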
static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

static void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed. The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs. Hopefully whatever's on the other end
	 * can handle the errors. One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel. I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}

static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened. The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev. The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}

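/*
 * read(2) on the channel: block until a message is available (or the target
 * is torn down), then stream the dm_user_message header followed by any BIO
 * payload out to userspace.
 */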
static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now. Code elsewhere in the kernel
				 * handles restarting the syscall when
				 * appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock
			 * the user out. There's really nothing else
			 * we can do here. Note that we don't actually
			 * tear anything down until userspace has
			 * closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to
			 * return. My goal was really just to try and
			 * find something that wasn't likely to be
			 * returned by anything else in the miscdev
			 * path. The message "block device required"
			 * seems like a somewhat reasonable thing to
			 * say when the target has disappeared out from
			 * under us, but "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above. Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

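/*
 * write(2) on the channel: reassemble the dm_user_message header written
 * back by userspace, match it to an outstanding message by sequence number,
 * absorb any payload (for reads), and complete the BIO.
 */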
static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL. If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);
		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for single requests by overwriting
	 * the type field.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it. The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);
	bio_put(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock. Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

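/*
 * Closing the channel tears it down; any messages still outstanding on this
 * channel are errored out by channel_free().
 */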
static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.llseek = no_llseek,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};

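/*
 * Constructor: the third table argument names the misc device that will be
 * created under /dev/dm-user/ for this target.
 */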
static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference. This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	t->daemon_terminated = false;
	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened and therefore
	 * concurrent references to the channel can happen. Holding the target
	 * lock during misc_register() could deadlock. If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks. IIUC, RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier. I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}

static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev. This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice. The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target. There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target, so this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);
	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}

/*
 * Consumes a BIO from device mapper, queueing it up for userspace.
 */
static int user_map(struct dm_target *ti, struct bio *bio)
{
	struct target *t;
	struct message *entry;

	t = target_from_target(ti);
	/*
	 * FIXME
	 *
	 * This seems like a bad idea. Specifically, here we're
	 * directly on the IO path when we take the target lock, which may also
	 * be taken from a user context. The user context doesn't actively
	 * trigger anything that may sleep while holding the lock, but this
	 * still seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks. I had a version that did this, but the head-of-line blocking
	 * from the circular buffer resulted in us needing a fairly large
	 * allocation in order to avoid situations in which the queue fills up
	 * and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here. Intuitively it
	 * seems better to keep the queues small in here (essentially sized to
	 * the user latency for performance reasons only) and rely on returning
	 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
	 * kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There are two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target. This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted. My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both. The first one is easy
	 * and gives DM the proper info, so it seems better. The BPF call
	 * seems overly complex for just this, but one could imagine wanting to
	 * sometimes return _MAPPED and a BPF filter would be the way to do
	 * that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups. While one could imagine just implementing this entirely in
	 * userspace, that would come with an appreciable performance penalty.
	 * Instead one could keep a BPF program that forwards most accesses
	 * directly to the backing block device while informing a userspace
	 * daemon of any discarded space and of writes to blocks that are to be
	 * backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	bio_get(bio);
	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	entry->delayed = false;
	entry->t = t;
	/* Pairs with the barrier in dev_read() */
	smp_wmb();
	list_add_tail(&entry->to_user, &t->to_user);

	/*
	 * If there is no daemon to process the IOs, queue these messages into
	 * a workqueue with a timeout.
	 */
	if (!is_user_space_thread_present(t))
		enqueue_delayed_work(entry, !t->daemon_terminated);

	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}

static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");