1/* Copyright (C) 2009 Red Hat, Inc.
2 * Copyright (C) 2006 Rusty Russell IBM Corporation
3 *
4 * Author: Michael S. Tsirkin <mst@redhat.com>
5 *
6 * Inspiration, some code, and most witty comments come from
7 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2.
10 *
11 * Generic code for virtio server in host kernel.
12 */
13
14#include <linux/eventfd.h>
15#include <linux/vhost.h>
16#include <linux/uio.h>
17#include <linux/mm.h>
18#include <linux/mmu_context.h>
19#include <linux/miscdevice.h>
20#include <linux/mutex.h>
21#include <linux/poll.h>
22#include <linux/file.h>
23#include <linux/highmem.h>
24#include <linux/slab.h>
25#include <linux/vmalloc.h>
26#include <linux/kthread.h>
27#include <linux/cgroup.h>
28#include <linux/module.h>
29#include <linux/sort.h>
30#include <linux/sched/mm.h>
31#include <linux/sched/signal.h>
32#include <linux/interval_tree_generic.h>
33#include <linux/nospec.h>
34#include <linux/kcov.h>
35
36#include "vhost.h"
37
38static ushort max_mem_regions = 64;
39module_param(max_mem_regions, ushort, 0444);
40MODULE_PARM_DESC(max_mem_regions,
41 "Maximum number of memory regions in memory map. (default: 64)");
42static int max_iotlb_entries = 2048;
43module_param(max_iotlb_entries, int, 0444);
44MODULE_PARM_DESC(max_iotlb_entries,
45 "Maximum number of iotlb entries. (default: 2048)");
46
47enum {
48 VHOST_MEMORY_F_LOG = 0x1,
49};
50
51#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
52#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
53
54INTERVAL_TREE_DEFINE(struct vhost_umem_node,
55 rb, __u64, __subtree_last,
56 START, LAST, static inline, vhost_umem_interval_tree);
57
58#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
59static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
60{
61 vq->user_be = !virtio_legacy_is_little_endian();
62}
63
64static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
65{
66 vq->user_be = true;
67}
68
69static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
70{
71 vq->user_be = false;
72}
73
74static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
75{
76 struct vhost_vring_state s;
77
78 if (vq->private_data)
79 return -EBUSY;
80
81 if (copy_from_user(&s, argp, sizeof(s)))
82 return -EFAULT;
83
84 if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
85 s.num != VHOST_VRING_BIG_ENDIAN)
86 return -EINVAL;
87
88 if (s.num == VHOST_VRING_BIG_ENDIAN)
89 vhost_enable_cross_endian_big(vq);
90 else
91 vhost_enable_cross_endian_little(vq);
92
93 return 0;
94}
95
96static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
97 int __user *argp)
98{
99 struct vhost_vring_state s = {
100 .index = idx,
101 .num = vq->user_be
102 };
103
104 if (copy_to_user(argp, &s, sizeof(s)))
105 return -EFAULT;
106
107 return 0;
108}
109
110static void vhost_init_is_le(struct vhost_virtqueue *vq)
111{
112 /* Note for legacy virtio: user_be is initialized at reset time
113 * according to the host endianness. If userspace does not set an
114 * explicit endianness, the default behavior is native endian, as
115 * expected by legacy virtio.
116 */
117 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
118}
119#else
120static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
121{
122}
123
124static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
125{
126 return -ENOIOCTLCMD;
127}
128
129static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
130 int __user *argp)
131{
132 return -ENOIOCTLCMD;
133}
134
135static void vhost_init_is_le(struct vhost_virtqueue *vq)
136{
137 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
138 || virtio_legacy_is_little_endian();
139}
140#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
141
142static void vhost_reset_is_le(struct vhost_virtqueue *vq)
143{
144 vhost_init_is_le(vq);
145}
146
147struct vhost_flush_struct {
148 struct vhost_work work;
149 struct completion wait_event;
150};
151
152static void vhost_flush_work(struct vhost_work *work)
153{
154 struct vhost_flush_struct *s;
155
156 s = container_of(work, struct vhost_flush_struct, work);
157 complete(&s->wait_event);
158}
159
160static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
161 poll_table *pt)
162{
163 struct vhost_poll *poll;
164
165 poll = container_of(pt, struct vhost_poll, table);
166 poll->wqh = wqh;
167 add_wait_queue(wqh, &poll->wait);
168}
169
170static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
171 void *key)
172{
173 struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
174
175 if (!(key_to_poll(key) & poll->mask))
176 return 0;
177
178 vhost_poll_queue(poll);
179 return 0;
180}
181
182void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
183{
184 clear_bit(VHOST_WORK_QUEUED, &work->flags);
185 work->fn = fn;
186}
187EXPORT_SYMBOL_GPL(vhost_work_init);
188
189/* Init poll structure */
190void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
191 __poll_t mask, struct vhost_dev *dev)
192{
193 init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
194 init_poll_funcptr(&poll->table, vhost_poll_func);
195 poll->mask = mask;
196 poll->dev = dev;
197 poll->wqh = NULL;
198
199 vhost_work_init(&poll->work, fn);
200}
201EXPORT_SYMBOL_GPL(vhost_poll_init);
202
203/* Start polling a file. We add ourselves to file's wait queue. The caller must
204 * keep a reference to a file until after vhost_poll_stop is called. */
205int vhost_poll_start(struct vhost_poll *poll, struct file *file)
206{
207 __poll_t mask;
208 int ret = 0;
209
210 if (poll->wqh)
211 return 0;
212
213 mask = vfs_poll(file, &poll->table);
214 if (mask)
215 vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
216 if (mask & EPOLLERR) {
217 vhost_poll_stop(poll);
218 ret = -EINVAL;
219 }
220
221 return ret;
222}
223EXPORT_SYMBOL_GPL(vhost_poll_start);
224
225/* Stop polling a file. After this function returns, it becomes safe to drop the
226 * file reference. You must also flush afterwards. */
227void vhost_poll_stop(struct vhost_poll *poll)
228{
229 if (poll->wqh) {
230 remove_wait_queue(poll->wqh, &poll->wait);
231 poll->wqh = NULL;
232 }
233}
234EXPORT_SYMBOL_GPL(vhost_poll_stop);
235
236void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
237{
238 struct vhost_flush_struct flush;
239
240 if (dev->worker) {
241 init_completion(&flush.wait_event);
242 vhost_work_init(&flush.work, vhost_flush_work);
243
244 vhost_work_queue(dev, &flush.work);
245 wait_for_completion(&flush.wait_event);
246 }
247}
248EXPORT_SYMBOL_GPL(vhost_work_flush);
249
250/* Flush any work that has been scheduled. When calling this, don't hold any
251 * locks that are also used by the callback. */
252void vhost_poll_flush(struct vhost_poll *poll)
253{
254 vhost_work_flush(poll->dev, &poll->work);
255}
256EXPORT_SYMBOL_GPL(vhost_poll_flush);
257
258void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
259{
260 if (!dev->worker)
261 return;
262
263 if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
264 /* We can only add the work to the list after we're
265 * sure it was not in the list.
266 * test_and_set_bit() implies a memory barrier.
267 */
268 llist_add(&work->node, &dev->work_list);
269 wake_up_process(dev->worker);
270 }
271}
272EXPORT_SYMBOL_GPL(vhost_work_queue);
273
274/* A lockless hint for busy polling code to exit the loop */
275bool vhost_has_work(struct vhost_dev *dev)
276{
277 return !llist_empty(&dev->work_list);
278}
279EXPORT_SYMBOL_GPL(vhost_has_work);
280
281void vhost_poll_queue(struct vhost_poll *poll)
282{
283 vhost_work_queue(poll->dev, &poll->work);
284}
285EXPORT_SYMBOL_GPL(vhost_poll_queue);
286
287static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
288{
289 int j;
290
291 for (j = 0; j < VHOST_NUM_ADDRS; j++)
292 vq->meta_iotlb[j] = NULL;
293}
294
295static void vhost_vq_meta_reset(struct vhost_dev *d)
296{
297 int i;
298
299 for (i = 0; i < d->nvqs; ++i)
300 __vhost_vq_meta_reset(d->vqs[i]);
301}
302
303static void vhost_vq_reset(struct vhost_dev *dev,
304 struct vhost_virtqueue *vq)
305{
306 vq->num = 1;
307 vq->desc = NULL;
308 vq->avail = NULL;
309 vq->used = NULL;
310 vq->last_avail_idx = 0;
311 vq->avail_idx = 0;
312 vq->last_used_idx = 0;
313 vq->signalled_used = 0;
314 vq->signalled_used_valid = false;
315 vq->used_flags = 0;
316 vq->log_used = false;
317 vq->log_addr = -1ull;
318 vq->private_data = NULL;
319 vq->acked_features = 0;
320 vq->acked_backend_features = 0;
321 vq->log_base = NULL;
322 vq->error_ctx = NULL;
323 vq->kick = NULL;
324 vq->call_ctx = NULL;
325 vq->log_ctx = NULL;
326 vhost_reset_is_le(vq);
327 vhost_disable_cross_endian(vq);
328 vq->busyloop_timeout = 0;
329 vq->umem = NULL;
330 vq->iotlb = NULL;
331 __vhost_vq_meta_reset(vq);
332}
333
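/* Worker kthread: runs in the owner's mm context and drains dev->work_list,
 * invoking each queued vhost_work handler in turn until kthread_stop().
 */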
334static int vhost_worker(void *data)
335{
336 struct vhost_dev *dev = data;
337 struct vhost_work *work, *work_next;
338 struct llist_node *node;
339 mm_segment_t oldfs = get_fs();
340
341 set_fs(USER_DS);
342 use_mm(dev->mm);
343
344 for (;;) {
345 /* mb paired w/ kthread_stop */
346 set_current_state(TASK_INTERRUPTIBLE);
347
348 if (kthread_should_stop()) {
349 __set_current_state(TASK_RUNNING);
350 break;
351 }
352
353 node = llist_del_all(&dev->work_list);
354 if (!node)
355 schedule();
356
357 node = llist_reverse_order(node);
358 /* make sure flag is seen after deletion */
359 smp_wmb();
360 llist_for_each_entry_safe(work, work_next, node, node) {
361 clear_bit(VHOST_WORK_QUEUED, &work->flags);
362 __set_current_state(TASK_RUNNING);
363 kcov_remote_start_common(dev->kcov_handle);
364 work->fn(work);
365 kcov_remote_stop();
366 if (need_resched())
367 schedule();
368 }
369 }
370 unuse_mm(dev->mm);
371 set_fs(oldfs);
372 return 0;
373}
374
375static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
376{
377 kfree(vq->indirect);
378 vq->indirect = NULL;
379 kfree(vq->log);
380 vq->log = NULL;
381 kfree(vq->heads);
382 vq->heads = NULL;
383}
384
385/* Helper to allocate iovec buffers for all vqs. */
386static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
387{
388 struct vhost_virtqueue *vq;
389 int i;
390
391 for (i = 0; i < dev->nvqs; ++i) {
392 vq = dev->vqs[i];
393 vq->indirect = kmalloc_array(UIO_MAXIOV,
394 sizeof(*vq->indirect),
395 GFP_KERNEL);
396 vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
397 GFP_KERNEL);
398 vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
399 GFP_KERNEL);
400 if (!vq->indirect || !vq->log || !vq->heads)
401 goto err_nomem;
402 }
403 return 0;
404
405err_nomem:
406 for (; i >= 0; --i)
407 vhost_vq_free_iovecs(dev->vqs[i]);
408 return -ENOMEM;
409}
410
411static void vhost_dev_free_iovecs(struct vhost_dev *dev)
412{
413 int i;
414
415 for (i = 0; i < dev->nvqs; ++i)
416 vhost_vq_free_iovecs(dev->vqs[i]);
417}
418
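/* Limit how much work a single handler invocation may do: once the packet or
 * byte budget for this device is spent, requeue the poll work and return true
 * so the caller yields the worker thread.
 */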
419bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
420 int pkts, int total_len)
421{
422 struct vhost_dev *dev = vq->dev;
423
424 if ((dev->byte_weight && total_len >= dev->byte_weight) ||
425 pkts >= dev->weight) {
426 vhost_poll_queue(&vq->poll);
427 return true;
428 }
429
430 return false;
431}
432EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
433
434void vhost_dev_init(struct vhost_dev *dev,
435 struct vhost_virtqueue **vqs, int nvqs,
436 int iov_limit, int weight, int byte_weight)
437{
438 struct vhost_virtqueue *vq;
439 int i;
440
441 dev->vqs = vqs;
442 dev->nvqs = nvqs;
443 mutex_init(&dev->mutex);
444 dev->log_ctx = NULL;
445 dev->umem = NULL;
446 dev->iotlb = NULL;
447 dev->mm = NULL;
448 dev->worker = NULL;
449 dev->iov_limit = iov_limit;
450 dev->weight = weight;
451 dev->byte_weight = byte_weight;
452 init_llist_head(&dev->work_list);
453 init_waitqueue_head(&dev->wait);
454 INIT_LIST_HEAD(&dev->read_list);
455 INIT_LIST_HEAD(&dev->pending_list);
456 spin_lock_init(&dev->iotlb_lock);
457
458
459 for (i = 0; i < dev->nvqs; ++i) {
460 vq = dev->vqs[i];
461 vq->log = NULL;
462 vq->indirect = NULL;
463 vq->heads = NULL;
464 vq->dev = dev;
465 mutex_init(&vq->mutex);
466 vhost_vq_reset(dev, vq);
467 if (vq->handle_kick)
468 vhost_poll_init(&vq->poll, vq->handle_kick,
469 EPOLLIN, dev);
470 }
471}
472EXPORT_SYMBOL_GPL(vhost_dev_init);
473
474/* Caller should have device mutex */
475long vhost_dev_check_owner(struct vhost_dev *dev)
476{
477 /* Are you the owner? If not, I don't think you mean to do that */
478 return dev->mm == current->mm ? 0 : -EPERM;
479}
480EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
481
482struct vhost_attach_cgroups_struct {
483 struct vhost_work work;
484 struct task_struct *owner;
485 int ret;
486};
487
488static void vhost_attach_cgroups_work(struct vhost_work *work)
489{
490 struct vhost_attach_cgroups_struct *s;
491
492 s = container_of(work, struct vhost_attach_cgroups_struct, work);
493 s->ret = cgroup_attach_task_all(s->owner, current);
494}
495
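/* Move the worker thread into the owner's cgroups by running
 * cgroup_attach_task_all() from the worker's own context.
 */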
496static int vhost_attach_cgroups(struct vhost_dev *dev)
497{
498 struct vhost_attach_cgroups_struct attach;
499
500 attach.owner = current;
501 vhost_work_init(&attach.work, vhost_attach_cgroups_work);
502 vhost_work_queue(dev, &attach.work);
503 vhost_work_flush(dev, &attach.work);
504 return attach.ret;
505}
506
507/* Caller should have device mutex */
508bool vhost_dev_has_owner(struct vhost_dev *dev)
509{
510 return dev->mm;
511}
512EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
513
514/* Caller should have device mutex */
515long vhost_dev_set_owner(struct vhost_dev *dev)
516{
517 struct task_struct *worker;
518 int err;
519
520 /* Is there an owner already? */
521 if (vhost_dev_has_owner(dev)) {
522 err = -EBUSY;
523 goto err_mm;
524 }
525
526 /* No owner, become one */
527 dev->mm = get_task_mm(current);
528 dev->kcov_handle = kcov_common_handle();
529 worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
530 if (IS_ERR(worker)) {
531 err = PTR_ERR(worker);
532 goto err_worker;
533 }
534
535 dev->worker = worker;
536 wake_up_process(worker); /* avoid contributing to loadavg */
537
538 err = vhost_attach_cgroups(dev);
539 if (err)
540 goto err_cgroup;
541
542 err = vhost_dev_alloc_iovecs(dev);
543 if (err)
544 goto err_cgroup;
545
546 return 0;
547err_cgroup:
548 kthread_stop(worker);
549 dev->worker = NULL;
550err_worker:
551 if (dev->mm)
552 mmput(dev->mm);
553 dev->mm = NULL;
554 dev->kcov_handle = 0;
555err_mm:
556 return err;
557}
558EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
559
560struct vhost_umem *vhost_dev_reset_owner_prepare(void)
561{
562 return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);
563}
564EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
565
566/* Caller should have device mutex */
567void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
568{
569 int i;
570
571 vhost_dev_cleanup(dev);
572
573 /* Restore memory to default empty mapping. */
574 INIT_LIST_HEAD(&umem->umem_list);
575 dev->umem = umem;
576 /* We don't need VQ locks below since vhost_dev_cleanup makes sure
577 * VQs aren't running.
578 */
579 for (i = 0; i < dev->nvqs; ++i)
580 dev->vqs[i]->umem = umem;
581}
582EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
583
584void vhost_dev_stop(struct vhost_dev *dev)
585{
586 int i;
587
588 for (i = 0; i < dev->nvqs; ++i) {
589 if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
590 vhost_poll_stop(&dev->vqs[i]->poll);
591 vhost_poll_flush(&dev->vqs[i]->poll);
592 }
593 }
594}
595EXPORT_SYMBOL_GPL(vhost_dev_stop);
596
597static void vhost_umem_free(struct vhost_umem *umem,
598 struct vhost_umem_node *node)
599{
600 vhost_umem_interval_tree_remove(node, &umem->umem_tree);
601 list_del(&node->link);
602 kfree(node);
603 umem->numem--;
604}
605
606static void vhost_umem_clean(struct vhost_umem *umem)
607{
608 struct vhost_umem_node *node, *tmp;
609
610 if (!umem)
611 return;
612
613 list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
614 vhost_umem_free(umem, node);
615
616 kvfree(umem);
617}
618
619static void vhost_clear_msg(struct vhost_dev *dev)
620{
621 struct vhost_msg_node *node, *n;
622
623 spin_lock(&dev->iotlb_lock);
624
625 list_for_each_entry_safe(node, n, &dev->read_list, node) {
626 list_del(&node->node);
627 kfree(node);
628 }
629
630 list_for_each_entry_safe(node, n, &dev->pending_list, node) {
631 list_del(&node->node);
632 kfree(node);
633 }
634
635 spin_unlock(&dev->iotlb_lock);
636}
637
638void vhost_dev_cleanup(struct vhost_dev *dev)
639{
640 int i;
641
642 for (i = 0; i < dev->nvqs; ++i) {
643 if (dev->vqs[i]->error_ctx)
644 eventfd_ctx_put(dev->vqs[i]->error_ctx);
645 if (dev->vqs[i]->kick)
646 fput(dev->vqs[i]->kick);
647 if (dev->vqs[i]->call_ctx)
648 eventfd_ctx_put(dev->vqs[i]->call_ctx);
649 vhost_vq_reset(dev, dev->vqs[i]);
650 }
651 vhost_dev_free_iovecs(dev);
652 if (dev->log_ctx)
653 eventfd_ctx_put(dev->log_ctx);
654 dev->log_ctx = NULL;
655 /* No one will access memory at this point */
656 vhost_umem_clean(dev->umem);
657 dev->umem = NULL;
658 vhost_umem_clean(dev->iotlb);
659 dev->iotlb = NULL;
660 vhost_clear_msg(dev);
661 wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
662 WARN_ON(!llist_empty(&dev->work_list));
663 if (dev->worker) {
664 kthread_stop(dev->worker);
665 dev->worker = NULL;
666 dev->kcov_handle = 0;
667 }
668 if (dev->mm)
669 mmput(dev->mm);
670 dev->mm = NULL;
671}
672EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
673
674static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
675{
676 u64 a = addr / VHOST_PAGE_SIZE / 8;
677
678 /* Make sure 64 bit math will not overflow. */
679 if (a > ULONG_MAX - (unsigned long)log_base ||
680 a + (unsigned long)log_base > ULONG_MAX)
681 return false;
682
683 return access_ok(VERIFY_WRITE, log_base + a,
684 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
685}
686
687static bool vhost_overflow(u64 uaddr, u64 size)
688{
689 /* Make sure 64 bit math will not overflow. */
690 return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size;
691}
692
693/* Caller should have vq mutex and device mutex. */
694static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
695 int log_all)
696{
697 struct vhost_umem_node *node;
698
699 if (!umem)
700 return false;
701
702 list_for_each_entry(node, &umem->umem_list, link) {
703 unsigned long a = node->userspace_addr;
704
705 if (vhost_overflow(node->userspace_addr, node->size))
706 return false;
707
708
709 if (!access_ok(VERIFY_WRITE, (void __user *)a,
710 node->size))
711 return false;
712 else if (log_all && !log_access_ok(log_base,
713 node->start,
714 node->size))
715 return false;
716 }
717 return true;
718}
719
720static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
721 u64 addr, unsigned int size,
722 int type)
723{
724 const struct vhost_umem_node *node = vq->meta_iotlb[type];
725
726 if (!node)
727 return NULL;
728
729 return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);
730}
731
732/* Can we switch to this memory table? */
733/* Caller should have device mutex but not vq mutex */
734static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
735 int log_all)
736{
737 int i;
738
739 for (i = 0; i < d->nvqs; ++i) {
740 bool ok;
741 bool log;
742
743 mutex_lock(&d->vqs[i]->mutex);
744 log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
745 /* If ring is inactive, will check when it's enabled. */
746 if (d->vqs[i]->private_data)
747 ok = vq_memory_access_ok(d->vqs[i]->log_base,
748 umem, log);
749 else
750 ok = true;
751 mutex_unlock(&d->vqs[i]->mutex);
752 if (!ok)
753 return false;
754 }
755 return true;
756}
757
758static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
759 struct iovec iov[], int iov_size, int access);
760
761static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
762 const void *from, unsigned size)
763{
764 int ret;
765
766 if (!vq->iotlb)
767 return __copy_to_user(to, from, size);
768 else {
769 /* This function should be called after iotlb
 770 * prefetch, which means we're sure that all of the vq
 771 * memory can be accessed through the iotlb, so -EAGAIN should
772 * not happen in this case.
773 */
774 struct iov_iter t;
775 void __user *uaddr = vhost_vq_meta_fetch(vq,
776 (u64)(uintptr_t)to, size,
777 VHOST_ADDR_USED);
778
779 if (uaddr)
780 return __copy_to_user(uaddr, from, size);
781
782 ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
783 ARRAY_SIZE(vq->iotlb_iov),
784 VHOST_ACCESS_WO);
785 if (ret < 0)
786 goto out;
787 iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
788 ret = copy_to_iter(from, size, &t);
789 if (ret == size)
790 ret = 0;
791 }
792out:
793 return ret;
794}
795
796static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
797 void __user *from, unsigned size)
798{
799 int ret;
800
801 if (!vq->iotlb)
802 return __copy_from_user(to, from, size);
803 else {
804 /* This function should be called after iotlb
 805 * prefetch, which means we're sure that the vq
 806 * memory can be accessed through the iotlb, so -EAGAIN should
807 * not happen in this case.
808 */
809 void __user *uaddr = vhost_vq_meta_fetch(vq,
810 (u64)(uintptr_t)from, size,
811 VHOST_ADDR_DESC);
812 struct iov_iter f;
813
814 if (uaddr)
815 return __copy_from_user(to, uaddr, size);
816
817 ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
818 ARRAY_SIZE(vq->iotlb_iov),
819 VHOST_ACCESS_RO);
820 if (ret < 0) {
821 vq_err(vq, "IOTLB translation failure: uaddr "
822 "%p size 0x%llx\n", from,
823 (unsigned long long) size);
824 goto out;
825 }
826 iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
827 ret = copy_from_iter(to, size, &f);
828 if (ret == size)
829 ret = 0;
830 }
831
832out:
833 return ret;
834}
835
836static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
837 void __user *addr, unsigned int size,
838 int type)
839{
840 int ret;
841
842 ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
843 ARRAY_SIZE(vq->iotlb_iov),
844 VHOST_ACCESS_RO);
845 if (ret < 0) {
846 vq_err(vq, "IOTLB translation failure: uaddr "
847 "%p size 0x%llx\n", addr,
848 (unsigned long long) size);
849 return NULL;
850 }
851
852 if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
853 vq_err(vq, "Non atomic userspace memory access: uaddr "
854 "%p size 0x%llx\n", addr,
855 (unsigned long long) size);
856 return NULL;
857 }
858
859 return vq->iotlb_iov[0].iov_base;
860}
861
862/* This function should be called after iotlb
 863 * prefetch, which means we're sure that the vq
 864 * memory can be accessed through the iotlb, so -EAGAIN should
865 * not happen in this case.
866 */
867static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
868 void *addr, unsigned int size,
869 int type)
870{
871 void __user *uaddr = vhost_vq_meta_fetch(vq,
872 (u64)(uintptr_t)addr, size, type);
873 if (uaddr)
874 return uaddr;
875
876 return __vhost_get_user_slow(vq, addr, size, type);
877}
878
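/* Ring accessors: without an IOTLB these reduce to __put_user()/__get_user()
 * on the user pointer; with an IOTLB they go through the translation cached
 * by the prefetch path (vhost_vq_meta_fetch) or fall back to a slow lookup.
 */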
879#define vhost_put_user(vq, x, ptr) \
880({ \
881 int ret = -EFAULT; \
882 if (!vq->iotlb) { \
883 ret = __put_user(x, ptr); \
884 } else { \
885 __typeof__(ptr) to = \
886 (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
887 sizeof(*ptr), VHOST_ADDR_USED); \
888 if (to != NULL) \
889 ret = __put_user(x, to); \
890 else \
891 ret = -EFAULT; \
892 } \
893 ret; \
894})
895
896#define vhost_get_user(vq, x, ptr, type) \
897({ \
898 int ret; \
899 if (!vq->iotlb) { \
900 ret = __get_user(x, ptr); \
901 } else { \
902 __typeof__(ptr) from = \
903 (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
904 sizeof(*ptr), \
905 type); \
906 if (from != NULL) \
907 ret = __get_user(x, from); \
908 else \
909 ret = -EFAULT; \
910 } \
911 ret; \
912})
913
914#define vhost_get_avail(vq, x, ptr) \
915 vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)
916
917#define vhost_get_used(vq, x, ptr) \
918 vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
919
920static void vhost_dev_lock_vqs(struct vhost_dev *d)
921{
922 int i = 0;
923 for (i = 0; i < d->nvqs; ++i)
924 mutex_lock_nested(&d->vqs[i]->mutex, i);
925}
926
927static void vhost_dev_unlock_vqs(struct vhost_dev *d)
928{
929 int i = 0;
930 for (i = 0; i < d->nvqs; ++i)
931 mutex_unlock(&d->vqs[i]->mutex);
932}
933
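/* Insert a new mapping into the umem interval tree and list. When the IOTLB
 * is full (max_iotlb_entries), the oldest entry is evicted to make room.
 */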
934static int vhost_new_umem_range(struct vhost_umem *umem,
935 u64 start, u64 size, u64 end,
936 u64 userspace_addr, int perm)
937{
938 struct vhost_umem_node *tmp, *node;
939
940 if (!size)
941 return -EFAULT;
942
943 node = kmalloc(sizeof(*node), GFP_ATOMIC);
944 if (!node)
945 return -ENOMEM;
946
947 if (umem->numem == max_iotlb_entries) {
948 tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
949 vhost_umem_free(umem, tmp);
950 }
951
952 node->start = start;
953 node->size = size;
954 node->last = end;
955 node->userspace_addr = userspace_addr;
956 node->perm = perm;
957 INIT_LIST_HEAD(&node->link);
958 list_add_tail(&node->link, &umem->umem_list);
959 vhost_umem_interval_tree_insert(node, &umem->umem_tree);
960 umem->numem++;
961
962 return 0;
963}
964
965static void vhost_del_umem_range(struct vhost_umem *umem,
966 u64 start, u64 end)
967{
968 struct vhost_umem_node *node;
969
970 while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
971 start, end)))
972 vhost_umem_free(umem, node);
973}
974
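/* A mapping was just added: wake up any virtqueue whose pending IOTLB miss
 * falls inside the newly updated range so it can retry the translation.
 */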
975static void vhost_iotlb_notify_vq(struct vhost_dev *d,
976 struct vhost_iotlb_msg *msg)
977{
978 struct vhost_msg_node *node, *n;
979
980 spin_lock(&d->iotlb_lock);
981
982 list_for_each_entry_safe(node, n, &d->pending_list, node) {
983 struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
984 if (msg->iova <= vq_msg->iova &&
985 msg->iova + msg->size - 1 >= vq_msg->iova &&
986 vq_msg->type == VHOST_IOTLB_MISS) {
987 vhost_poll_queue(&node->vq->poll);
988 list_del(&node->node);
989 kfree(node);
990 }
991 }
992
993 spin_unlock(&d->iotlb_lock);
994}
995
996static bool umem_access_ok(u64 uaddr, u64 size, int access)
997{
998 unsigned long a = uaddr;
999
1000 /* Make sure 64 bit math will not overflow. */
1001 if (vhost_overflow(uaddr, size))
1002 return false;
1003
1004 if ((access & VHOST_ACCESS_RO) &&
1005 !access_ok(VERIFY_READ, (void __user *)a, size))
1006 return false;
1007 if ((access & VHOST_ACCESS_WO) &&
1008 !access_ok(VERIFY_WRITE, (void __user *)a, size))
1009 return false;
1010 return true;
1011}
1012
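/* Handle an IOTLB message from userspace: install a new translation
 * (VHOST_IOTLB_UPDATE) or drop a range (VHOST_IOTLB_INVALIDATE), resetting
 * the cached metadata translations in either case.
 */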
1013static int vhost_process_iotlb_msg(struct vhost_dev *dev,
1014 struct vhost_iotlb_msg *msg)
1015{
1016 int ret = 0;
1017
1018 mutex_lock(&dev->mutex);
1019 vhost_dev_lock_vqs(dev);
1020 switch (msg->type) {
1021 case VHOST_IOTLB_UPDATE:
1022 if (!dev->iotlb) {
1023 ret = -EFAULT;
1024 break;
1025 }
1026 if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
1027 ret = -EFAULT;
1028 break;
1029 }
1030 vhost_vq_meta_reset(dev);
1031 if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
1032 msg->iova + msg->size - 1,
1033 msg->uaddr, msg->perm)) {
1034 ret = -ENOMEM;
1035 break;
1036 }
1037 vhost_iotlb_notify_vq(dev, msg);
1038 break;
1039 case VHOST_IOTLB_INVALIDATE:
1040 if (!dev->iotlb) {
1041 ret = -EFAULT;
1042 break;
1043 }
1044 vhost_vq_meta_reset(dev);
1045 vhost_del_umem_range(dev->iotlb, msg->iova,
1046 msg->iova + msg->size - 1);
1047 break;
1048 default:
1049 ret = -EINVAL;
1050 break;
1051 }
1052
1053 vhost_dev_unlock_vqs(dev);
1054 mutex_unlock(&dev->mutex);
1055
1056 return ret;
1057}
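
/* Writes to the vhost chardev carry IOTLB messages: parse the V1 or V2
 * header, then feed the embedded struct vhost_iotlb_msg to
 * vhost_process_iotlb_msg().
 */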
1058ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
1059 struct iov_iter *from)
1060{
1061 struct vhost_iotlb_msg msg;
1062 size_t offset;
1063 int type, ret;
1064
1065 ret = copy_from_iter(&type, sizeof(type), from);
1066 if (ret != sizeof(type)) {
1067 ret = -EINVAL;
1068 goto done;
1069 }
1070
1071 switch (type) {
1072 case VHOST_IOTLB_MSG:
 1073 /* There may be a hole after the type field for the V1 message format,
1074 * so skip it here.
1075 */
1076 offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
1077 break;
1078 case VHOST_IOTLB_MSG_V2:
1079 offset = sizeof(__u32);
1080 break;
1081 default:
1082 ret = -EINVAL;
1083 goto done;
1084 }
1085
1086 iov_iter_advance(from, offset);
1087 ret = copy_from_iter(&msg, sizeof(msg), from);
1088 if (ret != sizeof(msg)) {
1089 ret = -EINVAL;
1090 goto done;
1091 }
1092 if (vhost_process_iotlb_msg(dev, &msg)) {
1093 ret = -EFAULT;
1094 goto done;
1095 }
1096
1097 ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
1098 sizeof(struct vhost_msg_v2);
1099done:
1100 return ret;
1101}
1102EXPORT_SYMBOL(vhost_chr_write_iter);
1103
1104__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
1105 poll_table *wait)
1106{
1107 __poll_t mask = 0;
1108
1109 poll_wait(file, &dev->wait, wait);
1110
1111 if (!list_empty(&dev->read_list))
1112 mask |= EPOLLIN | EPOLLRDNORM;
1113
1114 return mask;
1115}
1116EXPORT_SYMBOL(vhost_chr_poll);
1117
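/* Read side of the IOTLB miss protocol: wait for a miss message on read_list,
 * copy it out to userspace and, on success, park the node on pending_list
 * until the corresponding update arrives.
 */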
1118ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
1119 int noblock)
1120{
1121 DEFINE_WAIT(wait);
1122 struct vhost_msg_node *node;
1123 ssize_t ret = 0;
1124 unsigned size = sizeof(struct vhost_msg);
1125
1126 if (iov_iter_count(to) < size)
1127 return 0;
1128
1129 while (1) {
1130 if (!noblock)
1131 prepare_to_wait(&dev->wait, &wait,
1132 TASK_INTERRUPTIBLE);
1133
1134 node = vhost_dequeue_msg(dev, &dev->read_list);
1135 if (node)
1136 break;
1137 if (noblock) {
1138 ret = -EAGAIN;
1139 break;
1140 }
1141 if (signal_pending(current)) {
1142 ret = -ERESTARTSYS;
1143 break;
1144 }
1145 if (!dev->iotlb) {
1146 ret = -EBADFD;
1147 break;
1148 }
1149
1150 schedule();
1151 }
1152
1153 if (!noblock)
1154 finish_wait(&dev->wait, &wait);
1155
1156 if (node) {
1157 struct vhost_iotlb_msg *msg;
1158 void *start = &node->msg;
1159
1160 switch (node->msg.type) {
1161 case VHOST_IOTLB_MSG:
1162 size = sizeof(node->msg);
1163 msg = &node->msg.iotlb;
1164 break;
1165 case VHOST_IOTLB_MSG_V2:
1166 size = sizeof(node->msg_v2);
1167 msg = &node->msg_v2.iotlb;
1168 break;
1169 default:
1170 BUG();
1171 break;
1172 }
1173
1174 ret = copy_to_iter(start, size, to);
1175 if (ret != size || msg->type != VHOST_IOTLB_MISS) {
1176 kfree(node);
1177 return ret;
1178 }
1179 vhost_enqueue_msg(dev, &dev->pending_list, node);
1180 }
1181
1182 return ret;
1183}
1184EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
1185
1186static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
1187{
1188 struct vhost_dev *dev = vq->dev;
1189 struct vhost_msg_node *node;
1190 struct vhost_iotlb_msg *msg;
1191 bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
1192
1193 node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
1194 if (!node)
1195 return -ENOMEM;
1196
1197 if (v2) {
1198 node->msg_v2.type = VHOST_IOTLB_MSG_V2;
1199 msg = &node->msg_v2.iotlb;
1200 } else {
1201 msg = &node->msg.iotlb;
1202 }
1203
1204 msg->type = VHOST_IOTLB_MISS;
1205 msg->iova = iova;
1206 msg->perm = access;
1207
1208 vhost_enqueue_msg(dev, &dev->read_list, node);
1209
1210 return 0;
1211}
1212
1213static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
1214 struct vring_desc __user *desc,
1215 struct vring_avail __user *avail,
1216 struct vring_used __user *used)
1217
1218{
1219 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
1220
1221 return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
1222 access_ok(VERIFY_READ, avail,
1223 sizeof *avail + num * sizeof *avail->ring + s) &&
1224 access_ok(VERIFY_WRITE, used,
1225 sizeof *used + num * sizeof *used->ring + s);
1226}
1227
1228static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
1229 const struct vhost_umem_node *node,
1230 int type)
1231{
1232 int access = (type == VHOST_ADDR_USED) ?
1233 VHOST_ACCESS_WO : VHOST_ACCESS_RO;
1234
1235 if (likely(node->perm & access))
1236 vq->meta_iotlb[type] = node;
1237}
1238
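/* Check that the whole [addr, addr + len) range is present in the IOTLB with
 * the required access rights; queue an IOTLB miss and return false on a hole.
 */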
1239static bool iotlb_access_ok(struct vhost_virtqueue *vq,
1240 int access, u64 addr, u64 len, int type)
1241{
1242 const struct vhost_umem_node *node;
1243 struct vhost_umem *umem = vq->iotlb;
1244 u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
1245
1246 if (vhost_vq_meta_fetch(vq, addr, len, type))
1247 return true;
1248
1249 while (len > s) {
1250 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
1251 addr,
1252 last);
1253 if (node == NULL || node->start > addr) {
1254 vhost_iotlb_miss(vq, addr, access);
1255 return false;
1256 } else if (!(node->perm & access)) {
1257 /* Report the possible access violation by
 1258 * requesting another translation from userspace.
1259 */
1260 return false;
1261 }
1262
1263 size = node->size - addr + node->start;
1264
1265 if (orig_addr == addr && size >= len)
1266 vhost_vq_meta_update(vq, node, type);
1267
1268 s += size;
1269 addr += size;
1270 }
1271
1272 return true;
1273}
1274
1275int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
1276{
1277 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
1278 unsigned int num = vq->num;
1279
1280 if (!vq->iotlb)
1281 return 1;
1282
1283 return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
1284 num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
1285 iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
1286 sizeof *vq->avail +
1287 num * sizeof(*vq->avail->ring) + s,
1288 VHOST_ADDR_AVAIL) &&
1289 iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
1290 sizeof *vq->used +
1291 num * sizeof(*vq->used->ring) + s,
1292 VHOST_ADDR_USED);
1293}
1294EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
1295
1296/* Can we log writes? */
1297/* Caller should have device mutex but not vq mutex */
1298bool vhost_log_access_ok(struct vhost_dev *dev)
1299{
1300 return memory_access_ok(dev, dev->umem, 1);
1301}
1302EXPORT_SYMBOL_GPL(vhost_log_access_ok);
1303
1304/* Verify access for write logging. */
1305/* Caller should have vq mutex and device mutex */
1306static bool vq_log_access_ok(struct vhost_virtqueue *vq,
1307 void __user *log_base)
1308{
1309 size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
1310
1311 return vq_memory_access_ok(log_base, vq->umem,
1312 vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
1313 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
1314 sizeof *vq->used +
1315 vq->num * sizeof *vq->used->ring + s));
1316}
1317
1318/* Can we start vq? */
1319/* Caller should have vq mutex and device mutex */
1320bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
1321{
1322 if (!vq_log_access_ok(vq, vq->log_base))
1323 return false;
1324
1325 /* Access validation occurs at prefetch time with IOTLB */
1326 if (vq->iotlb)
1327 return true;
1328
1329 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
1330}
1331EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
1332
1333static struct vhost_umem *vhost_umem_alloc(void)
1334{
1335 struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);
1336
1337 if (!umem)
1338 return NULL;
1339
1340 umem->umem_tree = RB_ROOT_CACHED;
1341 umem->numem = 0;
1342 INIT_LIST_HEAD(&umem->umem_list);
1343
1344 return umem;
1345}
1346
1347static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
1348{
1349 struct vhost_memory mem, *newmem;
1350 struct vhost_memory_region *region;
1351 struct vhost_umem *newumem, *oldumem;
1352 unsigned long size = offsetof(struct vhost_memory, regions);
1353 int i;
1354
1355 if (copy_from_user(&mem, m, size))
1356 return -EFAULT;
1357 if (mem.padding)
1358 return -EOPNOTSUPP;
1359 if (mem.nregions > max_mem_regions)
1360 return -E2BIG;
1361 newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
1362 GFP_KERNEL);
1363 if (!newmem)
1364 return -ENOMEM;
1365
1366 memcpy(newmem, &mem, size);
1367 if (copy_from_user(newmem->regions, m->regions,
1368 mem.nregions * sizeof *m->regions)) {
1369 kvfree(newmem);
1370 return -EFAULT;
1371 }
1372
1373 newumem = vhost_umem_alloc();
1374 if (!newumem) {
1375 kvfree(newmem);
1376 return -ENOMEM;
1377 }
1378
1379 for (region = newmem->regions;
1380 region < newmem->regions + mem.nregions;
1381 region++) {
1382 if (vhost_new_umem_range(newumem,
1383 region->guest_phys_addr,
1384 region->memory_size,
1385 region->guest_phys_addr +
1386 region->memory_size - 1,
1387 region->userspace_addr,
1388 VHOST_ACCESS_RW))
1389 goto err;
1390 }
1391
1392 if (!memory_access_ok(d, newumem, 0))
1393 goto err;
1394
1395 oldumem = d->umem;
1396 d->umem = newumem;
1397
1398 /* All memory accesses are done under some VQ mutex. */
1399 for (i = 0; i < d->nvqs; ++i) {
1400 mutex_lock(&d->vqs[i]->mutex);
1401 d->vqs[i]->umem = newumem;
1402 mutex_unlock(&d->vqs[i]->mutex);
1403 }
1404
1405 kvfree(newmem);
1406 vhost_umem_clean(oldumem);
1407 return 0;
1408
1409err:
1410 vhost_umem_clean(newumem);
1411 kvfree(newmem);
1412 return -EFAULT;
1413}
1414
1415long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1416{
1417 struct file *eventfp, *filep = NULL;
1418 bool pollstart = false, pollstop = false;
1419 struct eventfd_ctx *ctx = NULL;
1420 u32 __user *idxp = argp;
1421 struct vhost_virtqueue *vq;
1422 struct vhost_vring_state s;
1423 struct vhost_vring_file f;
1424 struct vhost_vring_addr a;
1425 u32 idx;
1426 long r;
1427
1428 r = get_user(idx, idxp);
1429 if (r < 0)
1430 return r;
1431 if (idx >= d->nvqs)
1432 return -ENOBUFS;
1433
1434 idx = array_index_nospec(idx, d->nvqs);
1435 vq = d->vqs[idx];
1436
1437 mutex_lock(&vq->mutex);
1438
1439 switch (ioctl) {
1440 case VHOST_SET_VRING_NUM:
1441 /* Resizing ring with an active backend?
1442 * You don't want to do that. */
1443 if (vq->private_data) {
1444 r = -EBUSY;
1445 break;
1446 }
1447 if (copy_from_user(&s, argp, sizeof s)) {
1448 r = -EFAULT;
1449 break;
1450 }
1451 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
1452 r = -EINVAL;
1453 break;
1454 }
1455 vq->num = s.num;
1456 break;
1457 case VHOST_SET_VRING_BASE:
1458 /* Moving base with an active backend?
1459 * You don't want to do that. */
1460 if (vq->private_data) {
1461 r = -EBUSY;
1462 break;
1463 }
1464 if (copy_from_user(&s, argp, sizeof s)) {
1465 r = -EFAULT;
1466 break;
1467 }
1468 if (s.num > 0xffff) {
1469 r = -EINVAL;
1470 break;
1471 }
1472 vq->last_avail_idx = s.num;
1473 /* Forget the cached index value. */
1474 vq->avail_idx = vq->last_avail_idx;
1475 break;
1476 case VHOST_GET_VRING_BASE:
1477 s.index = idx;
1478 s.num = vq->last_avail_idx;
1479 if (copy_to_user(argp, &s, sizeof s))
1480 r = -EFAULT;
1481 break;
1482 case VHOST_SET_VRING_ADDR:
1483 if (copy_from_user(&a, argp, sizeof a)) {
1484 r = -EFAULT;
1485 break;
1486 }
1487 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
1488 r = -EOPNOTSUPP;
1489 break;
1490 }
1491 /* For 32bit, verify that the top 32bits of the user
1492 data are set to zero. */
1493 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
1494 (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
1495 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
1496 r = -EFAULT;
1497 break;
1498 }
1499
1500 /* Make sure it's safe to cast pointers to vring types. */
1501 BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
1502 BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
1503 if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
1504 (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
1505 (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {
1506 r = -EINVAL;
1507 break;
1508 }
1509
1510 /* We only verify access here if backend is configured.
1511 * If it is not, we don't as size might not have been setup.
1512 * We will verify when backend is configured. */
1513 if (vq->private_data) {
1514 if (!vq_access_ok(vq, vq->num,
1515 (void __user *)(unsigned long)a.desc_user_addr,
1516 (void __user *)(unsigned long)a.avail_user_addr,
1517 (void __user *)(unsigned long)a.used_user_addr)) {
1518 r = -EINVAL;
1519 break;
1520 }
1521
1522 /* Also validate log access for used ring if enabled. */
1523 if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
1524 !log_access_ok(vq->log_base, a.log_guest_addr,
1525 sizeof *vq->used +
1526 vq->num * sizeof *vq->used->ring)) {
1527 r = -EINVAL;
1528 break;
1529 }
1530 }
1531
1532 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
1533 vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
1534 vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
1535 vq->log_addr = a.log_guest_addr;
1536 vq->used = (void __user *)(unsigned long)a.used_user_addr;
1537 break;
1538 case VHOST_SET_VRING_KICK:
1539 if (copy_from_user(&f, argp, sizeof f)) {
1540 r = -EFAULT;
1541 break;
1542 }
1543 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
1544 if (IS_ERR(eventfp)) {
1545 r = PTR_ERR(eventfp);
1546 break;
1547 }
1548 if (eventfp != vq->kick) {
1549 pollstop = (filep = vq->kick) != NULL;
1550 pollstart = (vq->kick = eventfp) != NULL;
1551 } else
1552 filep = eventfp;
1553 break;
1554 case VHOST_SET_VRING_CALL:
1555 if (copy_from_user(&f, argp, sizeof f)) {
1556 r = -EFAULT;
1557 break;
1558 }
1559 ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd);
1560 if (IS_ERR(ctx)) {
1561 r = PTR_ERR(ctx);
1562 break;
1563 }
1564 swap(ctx, vq->call_ctx);
1565 break;
1566 case VHOST_SET_VRING_ERR:
1567 if (copy_from_user(&f, argp, sizeof f)) {
1568 r = -EFAULT;
1569 break;
1570 }
1571 ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd);
1572 if (IS_ERR(ctx)) {
1573 r = PTR_ERR(ctx);
1574 break;
1575 }
1576 swap(ctx, vq->error_ctx);
1577 break;
1578 case VHOST_SET_VRING_ENDIAN:
1579 r = vhost_set_vring_endian(vq, argp);
1580 break;
1581 case VHOST_GET_VRING_ENDIAN:
1582 r = vhost_get_vring_endian(vq, idx, argp);
1583 break;
1584 case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
1585 if (copy_from_user(&s, argp, sizeof(s))) {
1586 r = -EFAULT;
1587 break;
1588 }
1589 vq->busyloop_timeout = s.num;
1590 break;
1591 case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
1592 s.index = idx;
1593 s.num = vq->busyloop_timeout;
1594 if (copy_to_user(argp, &s, sizeof(s)))
1595 r = -EFAULT;
1596 break;
1597 default:
1598 r = -ENOIOCTLCMD;
1599 }
1600
1601 if (pollstop && vq->handle_kick)
1602 vhost_poll_stop(&vq->poll);
1603
1604 if (!IS_ERR_OR_NULL(ctx))
1605 eventfd_ctx_put(ctx);
1606 if (filep)
1607 fput(filep);
1608
1609 if (pollstart && vq->handle_kick)
1610 r = vhost_poll_start(&vq->poll, vq->kick);
1611
1612 mutex_unlock(&vq->mutex);
1613
1614 if (pollstop && vq->handle_kick)
1615 vhost_poll_flush(&vq->poll);
1616 return r;
1617}
1618EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
1619
1620int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
1621{
1622 struct vhost_umem *niotlb, *oiotlb;
1623 int i;
1624
1625 niotlb = vhost_umem_alloc();
1626 if (!niotlb)
1627 return -ENOMEM;
1628
1629 oiotlb = d->iotlb;
1630 d->iotlb = niotlb;
1631
1632 for (i = 0; i < d->nvqs; ++i) {
1633 struct vhost_virtqueue *vq = d->vqs[i];
1634
1635 mutex_lock(&vq->mutex);
1636 vq->iotlb = niotlb;
1637 __vhost_vq_meta_reset(vq);
1638 mutex_unlock(&vq->mutex);
1639 }
1640
1641 vhost_umem_clean(oiotlb);
1642
1643 return 0;
1644}
1645EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
1646
1647/* Caller must have device mutex */
1648long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1649{
1650 struct eventfd_ctx *ctx;
1651 u64 p;
1652 long r;
1653 int i, fd;
1654
1655 /* If you are not the owner, you can become one */
1656 if (ioctl == VHOST_SET_OWNER) {
1657 r = vhost_dev_set_owner(d);
1658 goto done;
1659 }
1660
1661 /* You must be the owner to do anything else */
1662 r = vhost_dev_check_owner(d);
1663 if (r)
1664 goto done;
1665
1666 switch (ioctl) {
1667 case VHOST_SET_MEM_TABLE:
1668 r = vhost_set_memory(d, argp);
1669 break;
1670 case VHOST_SET_LOG_BASE:
1671 if (copy_from_user(&p, argp, sizeof p)) {
1672 r = -EFAULT;
1673 break;
1674 }
1675 if ((u64)(unsigned long)p != p) {
1676 r = -EFAULT;
1677 break;
1678 }
1679 for (i = 0; i < d->nvqs; ++i) {
1680 struct vhost_virtqueue *vq;
1681 void __user *base = (void __user *)(unsigned long)p;
1682 vq = d->vqs[i];
1683 mutex_lock(&vq->mutex);
1684 /* If ring is inactive, will check when it's enabled. */
1685 if (vq->private_data && !vq_log_access_ok(vq, base))
1686 r = -EFAULT;
1687 else
1688 vq->log_base = base;
1689 mutex_unlock(&vq->mutex);
1690 }
1691 break;
1692 case VHOST_SET_LOG_FD:
1693 r = get_user(fd, (int __user *)argp);
1694 if (r < 0)
1695 break;
1696 ctx = fd == -1 ? NULL : eventfd_ctx_fdget(fd);
1697 if (IS_ERR(ctx)) {
1698 r = PTR_ERR(ctx);
1699 break;
1700 }
1701 swap(ctx, d->log_ctx);
1702 for (i = 0; i < d->nvqs; ++i) {
1703 mutex_lock(&d->vqs[i]->mutex);
1704 d->vqs[i]->log_ctx = d->log_ctx;
1705 mutex_unlock(&d->vqs[i]->mutex);
1706 }
1707 if (ctx)
1708 eventfd_ctx_put(ctx);
1709 break;
1710 default:
1711 r = -ENOIOCTLCMD;
1712 break;
1713 }
1714done:
1715 return r;
1716}
1717EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
1718
1719/* TODO: This is really inefficient. We need something like get_user()
1720 * (instruction directly accesses the data, with an exception table entry
1721 * returning -EFAULT). See Documentation/x86/exception-tables.txt.
1722 */
1723static int set_bit_to_user(int nr, void __user *addr)
1724{
1725 unsigned long log = (unsigned long)addr;
1726 struct page *page;
1727 void *base;
1728 int bit = nr + (log % PAGE_SIZE) * 8;
1729 int r;
1730
1731 r = get_user_pages_fast(log, 1, 1, &page);
1732 if (r < 0)
1733 return r;
1734 BUG_ON(r != 1);
1735 base = kmap_atomic(page);
1736 set_bit(bit, base);
1737 kunmap_atomic(base);
1738 set_page_dirty_lock(page);
1739 put_page(page);
1740 return 0;
1741}
1742
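/* Mark pages as dirty in the userspace log bitmap: one bit per
 * VHOST_PAGE_SIZE page, starting at write_address for write_length bytes.
 */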
1743static int log_write(void __user *log_base,
1744 u64 write_address, u64 write_length)
1745{
1746 u64 write_page = write_address / VHOST_PAGE_SIZE;
1747 int r;
1748
1749 if (!write_length)
1750 return 0;
1751 write_length += write_address % VHOST_PAGE_SIZE;
1752 for (;;) {
1753 u64 base = (u64)(unsigned long)log_base;
1754 u64 log = base + write_page / 8;
1755 int bit = write_page % 8;
1756 if ((u64)(unsigned long)log != log)
1757 return -EFAULT;
1758 r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
1759 if (r < 0)
1760 return r;
1761 if (write_length <= VHOST_PAGE_SIZE)
1762 break;
1763 write_length -= VHOST_PAGE_SIZE;
1764 write_page += 1;
1765 }
1766 return r;
1767}
1768
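/* Log a write done through a host virtual address: walk the umem list to map
 * the HVA range back to guest physical addresses and log each overlapping
 * chunk.
 */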
1769static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
1770{
1771 struct vhost_umem *umem = vq->umem;
1772 struct vhost_umem_node *u;
1773 u64 start, end, l, min;
1774 int r;
1775 bool hit = false;
1776
1777 while (len) {
1778 min = len;
 1779 /* More than one GPA can be mapped into a single HVA. So
1780 * iterate all possible umems here to be safe.
1781 */
1782 list_for_each_entry(u, &umem->umem_list, link) {
1783 if (u->userspace_addr > hva - 1 + len ||
1784 u->userspace_addr - 1 + u->size < hva)
1785 continue;
1786 start = max(u->userspace_addr, hva);
1787 end = min(u->userspace_addr - 1 + u->size,
1788 hva - 1 + len);
1789 l = end - start + 1;
1790 r = log_write(vq->log_base,
1791 u->start + start - u->userspace_addr,
1792 l);
1793 if (r < 0)
1794 return r;
1795 hit = true;
1796 min = min(l, min);
1797 }
1798
1799 if (!hit)
1800 return -EFAULT;
1801
1802 len -= min;
1803 hva += min;
1804 }
1805
1806 return 0;
1807}
1808
1809static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
1810{
1811 struct iovec iov[64];
1812 int i, ret;
1813
1814 if (!vq->iotlb)
1815 return log_write(vq->log_base, vq->log_addr + used_offset, len);
1816
1817 ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
1818 len, iov, 64, VHOST_ACCESS_WO);
1819 if (ret < 0)
1820 return ret;
1821
1822 for (i = 0; i < ret; i++) {
1823 ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1824 iov[i].iov_len);
1825 if (ret)
1826 return ret;
1827 }
1828
1829 return 0;
1830}
1831
1832int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
1833 unsigned int log_num, u64 len, struct iovec *iov, int count)
1834{
1835 int i, r;
1836
1837 /* Make sure data written is seen before log. */
1838 smp_wmb();
1839
1840 if (vq->iotlb) {
1841 for (i = 0; i < count; i++) {
1842 r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1843 iov[i].iov_len);
1844 if (r < 0)
1845 return r;
1846 }
1847 return 0;
1848 }
1849
1850 for (i = 0; i < log_num; ++i) {
1851 u64 l = min(log[i].len, len);
1852 r = log_write(vq->log_base, log[i].addr, l);
1853 if (r < 0)
1854 return r;
1855 len -= l;
1856 if (!len) {
1857 if (vq->log_ctx)
1858 eventfd_signal(vq->log_ctx, 1);
1859 return 0;
1860 }
1861 }
1862 /* Length written exceeds what we have stored. This is a bug. */
1863 BUG();
1864 return 0;
1865}
1866EXPORT_SYMBOL_GPL(vhost_log_write);
1867
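/* Push vq->used_flags out to the used ring and, if dirty logging is enabled,
 * log the write.
 */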
1868static int vhost_update_used_flags(struct vhost_virtqueue *vq)
1869{
1870 void __user *used;
1871 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
1872 &vq->used->flags) < 0)
1873 return -EFAULT;
1874 if (unlikely(vq->log_used)) {
1875 /* Make sure the flag is seen before log. */
1876 smp_wmb();
1877 /* Log used flag write. */
1878 used = &vq->used->flags;
1879 log_used(vq, (used - (void __user *)vq->used),
1880 sizeof vq->used->flags);
1881 if (vq->log_ctx)
1882 eventfd_signal(vq->log_ctx, 1);
1883 }
1884 return 0;
1885}
1886
1887static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
1888{
1889 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
1890 vhost_avail_event(vq)))
1891 return -EFAULT;
1892 if (unlikely(vq->log_used)) {
1893 void __user *used;
1894 /* Make sure the event is seen before log. */
1895 smp_wmb();
1896 /* Log avail event write */
1897 used = vhost_avail_event(vq);
1898 log_used(vq, (used - (void __user *)vq->used),
1899 sizeof *vhost_avail_event(vq));
1900 if (vq->log_ctx)
1901 eventfd_signal(vq->log_ctx, 1);
1902 }
1903 return 0;
1904}
1905
1906int vhost_vq_init_access(struct vhost_virtqueue *vq)
1907{
1908 __virtio16 last_used_idx;
1909 int r;
1910 bool is_le = vq->is_le;
1911
1912 if (!vq->private_data)
1913 return 0;
1914
1915 vhost_init_is_le(vq);
1916
1917 r = vhost_update_used_flags(vq);
1918 if (r)
1919 goto err;
1920 vq->signalled_used_valid = false;
1921 if (!vq->iotlb &&
1922 !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
1923 r = -EFAULT;
1924 goto err;
1925 }
1926 r = vhost_get_used(vq, last_used_idx, &vq->used->idx);
1927 if (r) {
1928 vq_err(vq, "Can't access used idx at %p\n",
1929 &vq->used->idx);
1930 goto err;
1931 }
1932 vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
1933 return 0;
1934
1935err:
1936 vq->is_le = is_le;
1937 return r;
1938}
1939EXPORT_SYMBOL_GPL(vhost_vq_init_access);
1940
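/* Translate a guest/IOVA range into host-userspace iovecs using the active
 * memory table (or IOTLB). Returns the number of iovecs used or a negative
 * errno; -EAGAIN means an IOTLB miss was queued and the caller should retry.
 */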
1941static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
1942 struct iovec iov[], int iov_size, int access)
1943{
1944 const struct vhost_umem_node *node;
1945 struct vhost_dev *dev = vq->dev;
1946 struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
1947 struct iovec *_iov;
1948 u64 s = 0;
1949 int ret = 0;
1950
1951 while ((u64)len > s) {
1952 u64 size;
1953 if (unlikely(ret >= iov_size)) {
1954 ret = -ENOBUFS;
1955 break;
1956 }
1957
1958 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
1959 addr, addr + len - 1);
1960 if (node == NULL || node->start > addr) {
1961 if (umem != dev->iotlb) {
1962 ret = -EFAULT;
1963 break;
1964 }
1965 ret = -EAGAIN;
1966 break;
1967 } else if (!(node->perm & access)) {
1968 ret = -EPERM;
1969 break;
1970 }
1971
1972 _iov = iov + ret;
1973 size = node->size - addr + node->start;
1974 _iov->iov_len = min((u64)len - s, size);
1975 _iov->iov_base = (void __user *)(unsigned long)
1976 (node->userspace_addr + addr - node->start);
1977 s += size;
1978 addr += size;
1979 ++ret;
1980 }
1981
1982 if (ret == -EAGAIN)
1983 vhost_iotlb_miss(vq, addr, access);
1984 return ret;
1985}
1986
1987/* Each buffer in the virtqueues is actually a chain of descriptors. This
1988 * function returns the next descriptor in the chain,
1989 * or -1U if we're at the end. */
1990static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
1991{
1992 unsigned int next;
1993
1994 /* If this descriptor says it doesn't chain, we're done. */
1995 if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
1996 return -1U;
1997
1998 /* Check they're not leading us off end of descriptors. */
1999 next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
2000 return next;
2001}
2002
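/* Fetch an indirect descriptor table and translate each of its entries,
 * appending the resulting iovecs to iov[] and updating *out_num / *in_num
 * (and the dirty log) exactly as for direct descriptors.
 */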
2003static int get_indirect(struct vhost_virtqueue *vq,
2004 struct iovec iov[], unsigned int iov_size,
2005 unsigned int *out_num, unsigned int *in_num,
2006 struct vhost_log *log, unsigned int *log_num,
2007 struct vring_desc *indirect)
2008{
2009 struct vring_desc desc;
2010 unsigned int i = 0, count, found = 0;
2011 u32 len = vhost32_to_cpu(vq, indirect->len);
2012 struct iov_iter from;
2013 int ret, access;
2014
2015 /* Sanity check */
2016 if (unlikely(len % sizeof desc)) {
2017 vq_err(vq, "Invalid length in indirect descriptor: "
2018 "len 0x%llx not multiple of 0x%zx\n",
2019 (unsigned long long)len,
2020 sizeof desc);
2021 return -EINVAL;
2022 }
2023
2024 ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
2025 UIO_MAXIOV, VHOST_ACCESS_RO);
2026 if (unlikely(ret < 0)) {
2027 if (ret != -EAGAIN)
2028 vq_err(vq, "Translation failure %d in indirect.\n", ret);
2029 return ret;
2030 }
2031 iov_iter_init(&from, READ, vq->indirect, ret, len);
2032
2033 /* We will use the result as an address to read from, so most
2034 * architectures only need a compiler barrier here. */
2035 read_barrier_depends();
2036
2037 count = len / sizeof desc;
2038 /* Buffers are chained via a 16 bit next field, so
2039 * we can have at most 2^16 of these. */
2040 if (unlikely(count > USHRT_MAX + 1)) {
2041 vq_err(vq, "Indirect buffer length too big: %d\n",
2042 indirect->len);
2043 return -E2BIG;
2044 }
2045
2046 do {
2047 unsigned iov_count = *in_num + *out_num;
2048 if (unlikely(++found > count)) {
2049 vq_err(vq, "Loop detected: last one at %u "
2050 "indirect size %u\n",
2051 i, count);
2052 return -EINVAL;
2053 }
2054 if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
2055 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
2056 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2057 return -EINVAL;
2058 }
2059 if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
2060 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
2061 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2062 return -EINVAL;
2063 }
2064
2065 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2066 access = VHOST_ACCESS_WO;
2067 else
2068 access = VHOST_ACCESS_RO;
2069
2070 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2071 vhost32_to_cpu(vq, desc.len), iov + iov_count,
2072 iov_size - iov_count, access);
2073 if (unlikely(ret < 0)) {
2074 if (ret != -EAGAIN)
2075 vq_err(vq, "Translation failure %d indirect idx %d\n",
2076 ret, i);
2077 return ret;
2078 }
2079 /* If this is an input descriptor, increment that count. */
2080 if (access == VHOST_ACCESS_WO) {
2081 *in_num += ret;
2082 if (unlikely(log && ret)) {
2083 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2084 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2085 ++*log_num;
2086 }
2087 } else {
2088 /* If it's an output descriptor, they're all supposed
2089 * to come before any input descriptors. */
2090 if (unlikely(*in_num)) {
2091 vq_err(vq, "Indirect descriptor "
2092 "has out after in: idx %d\n", i);
2093 return -EINVAL;
2094 }
2095 *out_num += ret;
2096 }
2097 } while ((i = next_desc(vq, &desc)) != -1);
2098 return 0;
2099}
2100
 2101/* This looks in the virtqueue for the first available buffer, and converts
2102 * it to an iovec for convenient access. Since descriptors consist of some
2103 * number of output then some number of input descriptors, it's actually two
2104 * iovecs, but we pack them into one and note how many of each there were.
2105 *
2106 * This function returns the descriptor number found, or vq->num (which is
2107 * never a valid descriptor number) if none was found. A negative code is
2108 * returned on error. */
2109int vhost_get_vq_desc(struct vhost_virtqueue *vq,
2110 struct iovec iov[], unsigned int iov_size,
2111 unsigned int *out_num, unsigned int *in_num,
2112 struct vhost_log *log, unsigned int *log_num)
2113{
2114 struct vring_desc desc;
2115 unsigned int i, head, found = 0;
2116 u16 last_avail_idx;
2117 __virtio16 avail_idx;
2118 __virtio16 ring_head;
2119 int ret, access;
2120
2121 /* Check it isn't doing very strange things with descriptor numbers. */
2122 last_avail_idx = vq->last_avail_idx;
2123
2124 if (vq->avail_idx == vq->last_avail_idx) {
2125 if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
2126 vq_err(vq, "Failed to access avail idx at %p\n",
2127 &vq->avail->idx);
2128 return -EFAULT;
2129 }
2130 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2131
2132 if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
2133 vq_err(vq, "Guest moved used index from %u to %u",
2134 last_avail_idx, vq->avail_idx);
2135 return -EFAULT;
2136 }
2137
2138 /* If there's nothing new since last we looked, return
2139 * invalid.
2140 */
2141 if (vq->avail_idx == last_avail_idx)
2142 return vq->num;
2143
2144 /* Only get avail ring entries after they have been
2145 * exposed by guest.
2146 */
2147 smp_rmb();
2148 }
2149
2150 /* Grab the next descriptor number they're advertising, and increment
2151 * the index we've seen. */
2152 if (unlikely(vhost_get_avail(vq, ring_head,
2153 &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
2154 vq_err(vq, "Failed to read head: idx %d address %p\n",
2155 last_avail_idx,
2156 &vq->avail->ring[last_avail_idx % vq->num]);
2157 return -EFAULT;
2158 }
2159
2160 head = vhost16_to_cpu(vq, ring_head);
2161
2162 /* If their number is silly, that's an error. */
2163 if (unlikely(head >= vq->num)) {
2164 vq_err(vq, "Guest says index %u > %u is available",
2165 head, vq->num);
2166 return -EINVAL;
2167 }
2168
2169	/* When we start there are neither input nor output buffers. */
2170 *out_num = *in_num = 0;
2171 if (unlikely(log))
2172 *log_num = 0;
2173
2174 i = head;
2175 do {
2176 unsigned iov_count = *in_num + *out_num;
2177 if (unlikely(i >= vq->num)) {
2178 vq_err(vq, "Desc index is %u > %u, head = %u",
2179 i, vq->num, head);
2180 return -EINVAL;
2181 }
2182 if (unlikely(++found > vq->num)) {
2183 vq_err(vq, "Loop detected: last one at %u "
2184 "vq size %u head %u\n",
2185 i, vq->num, head);
2186 return -EINVAL;
2187 }
2188 ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
2189 sizeof desc);
2190 if (unlikely(ret)) {
2191 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
2192 i, vq->desc + i);
2193 return -EFAULT;
2194 }
2195 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
2196 ret = get_indirect(vq, iov, iov_size,
2197 out_num, in_num,
2198 log, log_num, &desc);
2199 if (unlikely(ret < 0)) {
2200 if (ret != -EAGAIN)
2201 vq_err(vq, "Failure detected "
2202 "in indirect descriptor at idx %d\n", i);
2203 return ret;
2204 }
2205 continue;
2206 }
2207
2208 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2209 access = VHOST_ACCESS_WO;
2210 else
2211 access = VHOST_ACCESS_RO;
2212 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2213 vhost32_to_cpu(vq, desc.len), iov + iov_count,
2214 iov_size - iov_count, access);
2215 if (unlikely(ret < 0)) {
2216 if (ret != -EAGAIN)
2217 vq_err(vq, "Translation failure %d descriptor idx %d\n",
2218 ret, i);
2219 return ret;
2220 }
2221 if (access == VHOST_ACCESS_WO) {
2222 /* If this is an input descriptor,
2223 * increment that count. */
2224 *in_num += ret;
2225 if (unlikely(log && ret)) {
2226 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2227 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2228 ++*log_num;
2229 }
2230 } else {
2231 /* If it's an output descriptor, they're all supposed
2232 * to come before any input descriptors. */
2233 if (unlikely(*in_num)) {
2234 vq_err(vq, "Descriptor has out after in: "
2235 "idx %d\n", i);
2236 return -EINVAL;
2237 }
2238 *out_num += ret;
2239 }
2240 } while ((i = next_desc(vq, &desc)) != -1);
2241
2242 /* On success, increment avail index. */
2243 vq->last_avail_idx++;
2244
2245	/* Assume notifications from the guest are disabled at this point;
2246	 * if they aren't, we would need to update the avail_event index. */
2247 BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
2248 return head;
2249}
2250EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
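
/* A minimal usage sketch (illustrative only, nothing in this file calls it):
 * a device backend typically drains the ring along these lines, where iov[]
 * is a caller-provided struct iovec array and handle_buf() is a hypothetical
 * stand-in for the device-specific work. A negative head means the error was
 * already reported through vq_err(); head == vq->num means the ring is empty.
 *
 *	for (;;) {
 *		head = vhost_get_vq_desc(vq, iov, ARRAY_SIZE(iov),
 *					 &out, &in, NULL, NULL);
 *		if (head < 0)
 *			break;
 *		if (head == vq->num)
 *			break;
 *		len = handle_buf(iov, out, in);
 *		vhost_add_used_and_signal(dev, vq, head, len);
 *	}
 */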
2251
2252/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
2253void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
2254{
2255 vq->last_avail_idx -= n;
2256}
2257EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
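
/* For example, a backend that fetched a buffer with vhost_get_vq_desc() but
 * cannot process it yet (e.g. no room to forward the data) can call
 * vhost_discard_vq_desc(vq, 1); the same head will then be returned by the
 * next vhost_get_vq_desc() call. */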
2258
2259/* After we've used one of their buffers, we tell them about it. We'll then
2260 * want to notify the guest, using eventfd. */
2261int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
2262{
2263 struct vring_used_elem heads = {
2264 cpu_to_vhost32(vq, head),
2265 cpu_to_vhost32(vq, len)
2266 };
2267
2268 return vhost_add_used_n(vq, &heads, 1);
2269}
2270EXPORT_SYMBOL_GPL(vhost_add_used);
2271
2272static int __vhost_add_used_n(struct vhost_virtqueue *vq,
2273 struct vring_used_elem *heads,
2274 unsigned count)
2275{
2276 struct vring_used_elem __user *used;
2277 u16 old, new;
2278 int start;
2279
2280 start = vq->last_used_idx & (vq->num - 1);
2281 used = vq->used->ring + start;
2282 if (count == 1) {
2283 if (vhost_put_user(vq, heads[0].id, &used->id)) {
2284 vq_err(vq, "Failed to write used id");
2285 return -EFAULT;
2286 }
2287 if (vhost_put_user(vq, heads[0].len, &used->len)) {
2288 vq_err(vq, "Failed to write used len");
2289 return -EFAULT;
2290 }
2291 } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
2292 vq_err(vq, "Failed to write used");
2293 return -EFAULT;
2294 }
2295 if (unlikely(vq->log_used)) {
2296 /* Make sure data is seen before log. */
2297 smp_wmb();
2298 /* Log used ring entry write. */
2299 log_used(vq, ((void __user *)used - (void __user *)vq->used),
2300 count * sizeof *used);
2301 }
2302 old = vq->last_used_idx;
2303 new = (vq->last_used_idx += count);
2304 /* If the driver never bothers to signal in a very long while,
2305 * used index might wrap around. If that happens, invalidate
2306 * signalled_used index we stored. TODO: make sure driver
2307 * signals at least once in 2^16 and remove this. */
2308 if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
2309 vq->signalled_used_valid = false;
2310 return 0;
2311}
2312
2313/* After we've used one of their buffers, we tell them about it. We'll then
2314 * want to notify the guest, using eventfd. */
2315int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
2316 unsigned count)
2317{
2318 int start, n, r;
2319
2320 start = vq->last_used_idx & (vq->num - 1);
2321 n = vq->num - start;
2322 if (n < count) {
2323 r = __vhost_add_used_n(vq, heads, n);
2324 if (r < 0)
2325 return r;
2326 heads += n;
2327 count -= n;
2328 }
2329 r = __vhost_add_used_n(vq, heads, count);
2330
2331 /* Make sure buffer is written before we update index. */
2332 smp_wmb();
2333 if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
2334 &vq->used->idx)) {
2335 vq_err(vq, "Failed to increment used idx");
2336 return -EFAULT;
2337 }
2338 if (unlikely(vq->log_used)) {
2339 /* Make sure used idx is seen before log. */
2340 smp_wmb();
2341 /* Log used index update. */
2342 log_used(vq, offsetof(struct vring_used, idx),
2343 sizeof vq->used->idx);
2344 if (vq->log_ctx)
2345 eventfd_signal(vq->log_ctx, 1);
2346 }
2347 return r;
2348}
2349EXPORT_SYMBOL_GPL(vhost_add_used_n);
2350
2351static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2352{
2353 __u16 old, new;
2354 __virtio16 event;
2355 bool v;
2356 /* Flush out used index updates. This is paired
2357 * with the barrier that the Guest executes when enabling
2358 * interrupts. */
2359 smp_mb();
2360
2361 if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
2362 unlikely(vq->avail_idx == vq->last_avail_idx))
2363 return true;
2364
2365 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2366 __virtio16 flags;
2367 if (vhost_get_avail(vq, flags, &vq->avail->flags)) {
2368 vq_err(vq, "Failed to get flags");
2369 return true;
2370 }
2371 return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
2372 }
2373 old = vq->signalled_used;
2374 v = vq->signalled_used_valid;
2375 new = vq->signalled_used = vq->last_used_idx;
2376 vq->signalled_used_valid = true;
2377
2378 if (unlikely(!v))
2379 return true;
2380
2381 if (vhost_get_avail(vq, event, vhost_used_event(vq))) {
2382 vq_err(vq, "Failed to get used event idx");
2383 return true;
2384 }
2385 return vring_need_event(vhost16_to_cpu(vq, event), new, old);
2386}
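
/* Note on the event-index path above: vring_need_event(event, new, old)
 * evaluates (u16)(new - event - 1) < (u16)(new - old), i.e. it requests an
 * interrupt only when the guest-written used_event index falls inside the
 * window [old, new) of used entries we have just published, so one signal
 * can cover a whole batch of completions. */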
2387
2388/* This actually signals the guest, using eventfd. */
2389void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2390{
2391	/* Signal the Guest to tell them we used something up. */
2392 if (vq->call_ctx && vhost_notify(dev, vq))
2393 eventfd_signal(vq->call_ctx, 1);
2394}
2395EXPORT_SYMBOL_GPL(vhost_signal);
2396
2397/* And here's the combo meal deal. Supersize me! */
2398void vhost_add_used_and_signal(struct vhost_dev *dev,
2399 struct vhost_virtqueue *vq,
2400 unsigned int head, int len)
2401{
2402 vhost_add_used(vq, head, len);
2403 vhost_signal(dev, vq);
2404}
2405EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
2406
2407/* multi-buffer version of vhost_add_used_and_signal */
2408void vhost_add_used_and_signal_n(struct vhost_dev *dev,
2409 struct vhost_virtqueue *vq,
2410 struct vring_used_elem *heads, unsigned count)
2411{
2412 vhost_add_used_n(vq, heads, count);
2413 vhost_signal(dev, vq);
2414}
2415EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
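
/* Sketch of batched completion (illustrative; BATCH is a hypothetical size):
 * callers accumulate vring_used_elem entries and flush them in one call,
 * which amortizes the used-index update and the eventfd signal:
 *
 *	struct vring_used_elem heads[BATCH];
 *	...
 *	heads[done].id = cpu_to_vhost32(vq, head);
 *	heads[done].len = cpu_to_vhost32(vq, len);
 *	done++;
 *	...
 *	vhost_add_used_and_signal_n(dev, vq, heads, done);
 */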
2416
2417/* return true if we're sure that the available ring is empty */
2418bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2419{
2420 __virtio16 avail_idx;
2421 int r;
2422
2423 if (vq->avail_idx != vq->last_avail_idx)
2424 return false;
2425
2426 r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
2427 if (unlikely(r))
2428 return false;
2429 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2430
2431 return vq->avail_idx == vq->last_avail_idx;
2432}
2433EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
2434
2435/* OK, now we need to know about added descriptors. */
2436bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2437{
2438 __virtio16 avail_idx;
2439 int r;
2440
2441 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
2442 return false;
2443 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
2444 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2445 r = vhost_update_used_flags(vq);
2446 if (r) {
2447 vq_err(vq, "Failed to enable notification at %p: %d\n",
2448 &vq->used->flags, r);
2449 return false;
2450 }
2451 } else {
2452 r = vhost_update_avail_event(vq, vq->avail_idx);
2453 if (r) {
2454 vq_err(vq, "Failed to update avail event index at %p: %d\n",
2455 vhost_avail_event(vq), r);
2456 return false;
2457 }
2458 }
2459 /* They could have slipped one in as we were doing that: make
2460 * sure it's written, then check again. */
2461 smp_mb();
2462 r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
2463 if (r) {
2464 vq_err(vq, "Failed to check avail idx at %p: %d\n",
2465 &vq->avail->idx, r);
2466 return false;
2467 }
2468
2469 return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
2470}
2471EXPORT_SYMBOL_GPL(vhost_enable_notify);
2472
2473/* We don't need to be notified again. */
2474void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2475{
2476 int r;
2477
2478 if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
2479 return;
2480 vq->used_flags |= VRING_USED_F_NO_NOTIFY;
2481 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2482 r = vhost_update_used_flags(vq);
2483 if (r)
2484			vq_err(vq, "Failed to disable notification at %p: %d\n",
2485 &vq->used->flags, r);
2486 }
2487}
2488EXPORT_SYMBOL_GPL(vhost_disable_notify);
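
/* These notification helpers are normally used as a pair. A sketch of the
 * usual polling loop, with process_ring() standing in for the backend's
 * drain logic: vhost_enable_notify() returning true means buffers slipped
 * in while notifications were off, so we turn them back off and keep
 * polling; false means it is safe to wait for the next kick.
 *
 *	vhost_disable_notify(dev, vq);
 *	for (;;) {
 *		process_ring(vq);
 *		if (!vhost_enable_notify(dev, vq))
 *			break;
 *		vhost_disable_notify(dev, vq);
 *	}
 */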
2489
2490/* Create a new message. */
2491struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
2492{
2493 struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
2494 if (!node)
2495 return NULL;
2496
2497 /* Make sure all padding within the structure is initialized. */
2498 memset(&node->msg, 0, sizeof node->msg);
2499 node->vq = vq;
2500 node->msg.type = type;
2501 return node;
2502}
2503EXPORT_SYMBOL_GPL(vhost_new_msg);
2504
2505void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
2506 struct vhost_msg_node *node)
2507{
2508 spin_lock(&dev->iotlb_lock);
2509 list_add_tail(&node->node, head);
2510 spin_unlock(&dev->iotlb_lock);
2511
2512 wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
2513}
2514EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
2515
2516struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
2517 struct list_head *head)
2518{
2519 struct vhost_msg_node *node = NULL;
2520
2521 spin_lock(&dev->iotlb_lock);
2522 if (!list_empty(head)) {
2523 node = list_first_entry(head, struct vhost_msg_node,
2524 node);
2525 list_del(&node->node);
2526 }
2527 spin_unlock(&dev->iotlb_lock);
2528
2529 return node;
2530}
2531EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
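
/* Together these helpers form a small producer/consumer queue. A sketch of
 * the assumed flow, modelled on the IOTLB-miss path (list head, message type
 * and field names here are illustrative):
 *
 *	node = vhost_new_msg(vq, VHOST_IOTLB_MSG);
 *	if (!node)
 *		return -ENOMEM;
 *	node->msg.iotlb.iova = iova;
 *	node->msg.iotlb.type = VHOST_IOTLB_MISS;
 *	vhost_enqueue_msg(dev, &dev->read_list, node);
 *
 * and, on the consumer side (e.g. the char-device read handler):
 *
 *	node = vhost_dequeue_msg(dev, &dev->read_list);
 */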
2532
2533
2534static int __init vhost_init(void)
2535{
2536 return 0;
2537}
2538
2539static void __exit vhost_exit(void)
2540{
2541}
2542
2543module_init(vhost_init);
2544module_exit(vhost_exit);
2545
2546MODULE_VERSION("0.0.1");
2547MODULE_LICENSE("GPL v2");
2548MODULE_AUTHOR("Michael S. Tsirkin");
2549MODULE_DESCRIPTION("Host kernel accelerator for virtio");