Blame - marvell/linux/virt/kvm/kvm_main.c - T108

blob: 9b76bc0c5cd2c29c7c9ceb1a5bb431954010334e [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Kernel-based Virtual Machine driver for Linux
				4	*
				5	* This module enables machines with Intel VT-x extensions to run virtual
				6	* machines without emulation or binary translation.
				7	*
				8	* Copyright (C) 2006 Qumranet, Inc.
				9	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				10	*
				11	* Authors:
				12	* Avi Kivity <avi@qumranet.com>
				13	* Yaniv Kamay <yaniv@qumranet.com>
				14	*/
				15
				16	#include <kvm/iodev.h>
				17
				18	#include <linux/kvm_host.h>
				19	#include <linux/kvm.h>
				20	#include <linux/module.h>
				21	#include <linux/errno.h>
				22	#include <linux/percpu.h>
				23	#include <linux/mm.h>
				24	#include <linux/miscdevice.h>
				25	#include <linux/vmalloc.h>
				26	#include <linux/reboot.h>
				27	#include <linux/debugfs.h>
				28	#include <linux/highmem.h>
				29	#include <linux/file.h>
				30	#include <linux/syscore_ops.h>
				31	#include <linux/cpu.h>
				32	#include <linux/sched/signal.h>
				33	#include <linux/sched/mm.h>
				34	#include <linux/sched/stat.h>
				35	#include <linux/cpumask.h>
				36	#include <linux/smp.h>
				37	#include <linux/anon_inodes.h>
				38	#include <linux/profile.h>
				39	#include <linux/kvm_para.h>
				40	#include <linux/pagemap.h>
				41	#include <linux/mman.h>
				42	#include <linux/swap.h>
				43	#include <linux/bitops.h>
				44	#include <linux/spinlock.h>
				45	#include <linux/compat.h>
				46	#include <linux/srcu.h>
				47	#include <linux/hugetlb.h>
				48	#include <linux/slab.h>
				49	#include <linux/sort.h>
				50	#include <linux/bsearch.h>
				51	#include <linux/io.h>
				52	#include <linux/lockdep.h>
				53	#include <linux/kthread.h>
				54
				55	#include <asm/processor.h>
				56	#include <asm/ioctl.h>
				57	#include <linux/uaccess.h>
				58	#include <asm/pgtable.h>
				59
				60	#include "coalesced_mmio.h"
				61	#include "async_pf.h"
				62	#include "vfio.h"
				63
				64	#define CREATE_TRACE_POINTS
				65	#include <trace/events/kvm.h>
				66
				67	/* Worst case buffer size needed for holding an integer. */
				68	#define ITOA_MAX_LEN 12
				69
				70	MODULE_AUTHOR("Qumranet");
				71	MODULE_LICENSE("GPL");
				72
				73	/* Architectures should define their poll value according to the halt latency */
				74	unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
				75	module_param(halt_poll_ns, uint, 0644);
				76	EXPORT_SYMBOL_GPL(halt_poll_ns);
				77
				78	/* Default doubles per-vcpu halt_poll_ns. */
				79	unsigned int halt_poll_ns_grow = 2;
				80	module_param(halt_poll_ns_grow, uint, 0644);
				81	EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
				82
				83	/* The start value to grow halt_poll_ns from */
				84	unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
				85	module_param(halt_poll_ns_grow_start, uint, 0644);
				86	EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
				87
				88	/* Default resets per-vcpu halt_poll_ns . */
				89	unsigned int halt_poll_ns_shrink;
				90	module_param(halt_poll_ns_shrink, uint, 0644);
				91	EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
				92
				93	/*
				94	* Ordering of locks:
				95	*
				96	* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
				97	*/
				98
				99	DEFINE_MUTEX(kvm_lock);
				100	static DEFINE_RAW_SPINLOCK(kvm_count_lock);
				101	LIST_HEAD(vm_list);
				102
				103	static cpumask_var_t cpus_hardware_enabled;
				104	static int kvm_usage_count;
				105	static atomic_t hardware_enable_failed;
				106
				107	struct kmem_cache *kvm_vcpu_cache;
				108	EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
				109
				110	static __read_mostly struct preempt_ops kvm_preempt_ops;
				111
				112	struct dentry *kvm_debugfs_dir;
				113	EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
				114
				115	static int kvm_debugfs_num_entries;
				116	static const struct file_operations *stat_fops_per_vm[];
				117
				118	static struct file_operations kvm_chardev_ops;
				119
				120	static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
				121	unsigned long arg);
				122	#ifdef CONFIG_KVM_COMPAT
				123	static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				124	unsigned long arg);
				125	#define KVM_COMPAT(c) .compat_ioctl = (c)
				126	#else
				127	/*
				128	* For architectures that don't implement a compat infrastructure,
				129	* adopt a double line of defense:
				130	* - Prevent a compat task from opening /dev/kvm
				131	* - If the open has been done by a 64bit task, and the KVM fd
				132	* passed to a compat task, let the ioctls fail.
				133	*/
				134	static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				135	unsigned long arg) { return -EINVAL; }
				136
				137	static int kvm_no_compat_open(struct inode inode, struct file file)
				138	{
				139	return is_compat_task() ? -ENODEV : 0;
				140	}
				141	#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
				142	.open = kvm_no_compat_open
				143	#endif
				144	static int hardware_enable_all(void);
				145	static void hardware_disable_all(void);
				146
				147	static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
				148
				149	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
				150
				151	__visible bool kvm_rebooting;
				152	EXPORT_SYMBOL_GPL(kvm_rebooting);
				153
				154	static bool largepages_enabled = true;
				155
				156	#define KVM_EVENT_CREATE_VM 0
				157	#define KVM_EVENT_DESTROY_VM 1
				158	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
				159	static unsigned long long kvm_createvm_count;
				160	static unsigned long long kvm_active_vms;
				161
				162	__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
				163	unsigned long start, unsigned long end)
				164	{
				165	}
				166
				167	bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
				168	{
				169	/*
				170	* The metadata used by is_zone_device_page() to determine whether or
				171	* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
				172	* the device has been pinned, e.g. by get_user_pages(). WARN if the
				173	* page_count() is zero to help detect bad usage of this helper.
				174	*/
				175	if (!pfn_valid(pfn) \|\| WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
				176	return false;
				177
				178	return is_zone_device_page(pfn_to_page(pfn));
				179	}
				180
				181	bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
				182	{
				183	/*
				184	* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
				185	* perspective they are "normal" pages, albeit with slightly different
				186	* usage rules.
				187	*/
				188	if (pfn_valid(pfn))
				189	return PageReserved(pfn_to_page(pfn)) &&
				190	!is_zero_pfn(pfn) &&
				191	!kvm_is_zone_device_pfn(pfn);
				192
				193	return true;
				194	}
				195
				196	/*
				197	* Switches to specified vcpu, until a matching vcpu_put()
				198	*/
				199	void vcpu_load(struct kvm_vcpu *vcpu)
				200	{
				201	int cpu = get_cpu();
				202	preempt_notifier_register(&vcpu->preempt_notifier);
				203	kvm_arch_vcpu_load(vcpu, cpu);
				204	put_cpu();
				205	}
				206	EXPORT_SYMBOL_GPL(vcpu_load);
				207
				208	void vcpu_put(struct kvm_vcpu *vcpu)
				209	{
				210	preempt_disable();
				211	kvm_arch_vcpu_put(vcpu);
				212	preempt_notifier_unregister(&vcpu->preempt_notifier);
				213	preempt_enable();
				214	}
				215	EXPORT_SYMBOL_GPL(vcpu_put);
				216
				217	/* TODO: merge with kvm_arch_vcpu_should_kick */
				218	static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
				219	{
				220	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
				221
				222	/*
				223	* We need to wait for the VCPU to reenable interrupts and get out of
				224	* READING_SHADOW_PAGE_TABLES mode.
				225	*/
				226	if (req & KVM_REQUEST_WAIT)
				227	return mode != OUTSIDE_GUEST_MODE;
				228
				229	/*
				230	* Need to kick a running VCPU, but otherwise there is nothing to do.
				231	*/
				232	return mode == IN_GUEST_MODE;
				233	}
				234
				235	static void ack_flush(void *_completed)
				236	{
				237	}
				238
				239	static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
				240	{
				241	if (unlikely(!cpus))
				242	cpus = cpu_online_mask;
				243
				244	if (cpumask_empty(cpus))
				245	return false;
				246
				247	smp_call_function_many(cpus, ack_flush, NULL, wait);
				248	return true;
				249	}
				250
				251	bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				252	unsigned long *vcpu_bitmap, cpumask_var_t tmp)
				253	{
				254	int i, cpu, me;
				255	struct kvm_vcpu *vcpu;
				256	bool called;
				257
				258	me = get_cpu();
				259
				260	kvm_for_each_vcpu(i, vcpu, kvm) {
				261	if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
				262	continue;
				263
				264	kvm_make_request(req, vcpu);
				265	cpu = vcpu->cpu;
				266
				267	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
				268	continue;
				269
				270	if (tmp != NULL && cpu != -1 && cpu != me &&
				271	kvm_request_needs_ipi(vcpu, req))
				272	__cpumask_set_cpu(cpu, tmp);
				273	}
				274
				275	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
				276	put_cpu();
				277
				278	return called;
				279	}
				280
				281	bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
				282	{
				283	cpumask_var_t cpus;
				284	bool called;
				285
				286	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
				287
				288	called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
				289
				290	free_cpumask_var(cpus);
				291	return called;
				292	}
				293
				294	#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
				295	void kvm_flush_remote_tlbs(struct kvm *kvm)
				296	{
				297	/*
				298	* Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
				299	* kvm_make_all_cpus_request.
				300	*/
				301	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
				302
				303	/*
				304	* We want to publish modifications to the page tables before reading
				305	* mode. Pairs with a memory barrier in arch-specific code.
				306	* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
				307	* and smp_mb in walk_shadow_page_lockless_begin/end.
				308	* - powerpc: smp_mb in kvmppc_prepare_to_enter.
				309	*
				310	* There is already an smp_mb__after_atomic() before
				311	* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
				312	* barrier here.
				313	*/
				314	if (!kvm_arch_flush_remote_tlb(kvm)
				315	\|\| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
				316	++kvm->stat.remote_tlb_flush;
				317	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
				318	}
				319	EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
				320	#endif
				321
				322	void kvm_reload_remote_mmus(struct kvm *kvm)
				323	{
				324	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
				325	}
				326
				327	int kvm_vcpu_init(struct kvm_vcpu vcpu, struct kvm kvm, unsigned id)
				328	{
				329	struct page *page;
				330	int r;
				331
				332	mutex_init(&vcpu->mutex);
				333	vcpu->cpu = -1;
				334	vcpu->kvm = kvm;
				335	vcpu->vcpu_id = id;
				336	vcpu->pid = NULL;
				337	init_swait_queue_head(&vcpu->wq);
				338	kvm_async_pf_vcpu_init(vcpu);
				339
				340	vcpu->pre_pcpu = -1;
				341	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
				342
				343	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				344	if (!page) {
				345	r = -ENOMEM;
				346	goto fail;
				347	}
				348	vcpu->run = page_address(page);
				349
				350	kvm_vcpu_set_in_spin_loop(vcpu, false);
				351	kvm_vcpu_set_dy_eligible(vcpu, false);
				352	vcpu->preempted = false;
				353	vcpu->ready = false;
				354
				355	r = kvm_arch_vcpu_init(vcpu);
				356	if (r < 0)
				357	goto fail_free_run;
				358	return 0;
				359
				360	fail_free_run:
				361	free_page((unsigned long)vcpu->run);
				362	fail:
				363	return r;
				364	}
				365	EXPORT_SYMBOL_GPL(kvm_vcpu_init);
				366
				367	void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
				368	{
				369	/*
				370	* no need for rcu_read_lock as VCPU_RUN is the only place that
				371	* will change the vcpu->pid pointer and on uninit all file
				372	* descriptors are already gone.
				373	*/
				374	put_pid(rcu_dereference_protected(vcpu->pid, 1));
				375	kvm_arch_vcpu_uninit(vcpu);
				376	free_page((unsigned long)vcpu->run);
				377	}
				378	EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
				379
				380	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				381	static inline struct kvm mmu_notifier_to_kvm(struct mmu_notifier mn)
				382	{
				383	return container_of(mn, struct kvm, mmu_notifier);
				384	}
				385
				386	static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
				387	struct mm_struct *mm,
				388	unsigned long start, unsigned long end)
				389	{
				390	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				391	int idx;
				392
				393	idx = srcu_read_lock(&kvm->srcu);
				394	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
				395	srcu_read_unlock(&kvm->srcu, idx);
				396	}
				397
				398	static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
				399	struct mm_struct *mm,
				400	unsigned long address,
				401	pte_t pte)
				402	{
				403	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				404	int idx;
				405
				406	idx = srcu_read_lock(&kvm->srcu);
				407	spin_lock(&kvm->mmu_lock);
				408	kvm->mmu_notifier_seq++;
				409
				410	if (kvm_set_spte_hva(kvm, address, pte))
				411	kvm_flush_remote_tlbs(kvm);
				412
				413	spin_unlock(&kvm->mmu_lock);
				414	srcu_read_unlock(&kvm->srcu, idx);
				415	}
				416
				417	static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
				418	const struct mmu_notifier_range *range)
				419	{
				420	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				421	int need_tlb_flush = 0, idx;
				422
				423	idx = srcu_read_lock(&kvm->srcu);
				424	spin_lock(&kvm->mmu_lock);
				425	/*
				426	* The count increase must become visible at unlock time as no
				427	* spte can be established without taking the mmu_lock and
				428	* count is also read inside the mmu_lock critical section.
				429	*/
				430	kvm->mmu_notifier_count++;
				431	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
				432	range->flags);
				433	/* we've to flush the tlb before the pages can be freed */
				434	if (need_tlb_flush \|\| kvm->tlbs_dirty)
				435	kvm_flush_remote_tlbs(kvm);
				436
				437	spin_unlock(&kvm->mmu_lock);
				438	srcu_read_unlock(&kvm->srcu, idx);
				439
				440	return 0;
				441	}
				442
				443	static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
				444	const struct mmu_notifier_range *range)
				445	{
				446	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				447
				448	spin_lock(&kvm->mmu_lock);
				449	/*
				450	* This sequence increase will notify the kvm page fault that
				451	* the page that is going to be mapped in the spte could have
				452	* been freed.
				453	*/
				454	kvm->mmu_notifier_seq++;
				455	smp_wmb();
				456	/*
				457	* The above sequence increase must be visible before the
				458	* below count decrease, which is ensured by the smp_wmb above
				459	* in conjunction with the smp_rmb in mmu_notifier_retry().
				460	*/
				461	kvm->mmu_notifier_count--;
				462	spin_unlock(&kvm->mmu_lock);
				463
				464	BUG_ON(kvm->mmu_notifier_count < 0);
				465	}
				466
				467	static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
				468	struct mm_struct *mm,
				469	unsigned long start,
				470	unsigned long end)
				471	{
				472	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				473	int young, idx;
				474
				475	idx = srcu_read_lock(&kvm->srcu);
				476	spin_lock(&kvm->mmu_lock);
				477
				478	young = kvm_age_hva(kvm, start, end);
				479	if (young)
				480	kvm_flush_remote_tlbs(kvm);
				481
				482	spin_unlock(&kvm->mmu_lock);
				483	srcu_read_unlock(&kvm->srcu, idx);
				484
				485	return young;
				486	}
				487
				488	static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
				489	struct mm_struct *mm,
				490	unsigned long start,
				491	unsigned long end)
				492	{
				493	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				494	int young, idx;
				495
				496	idx = srcu_read_lock(&kvm->srcu);
				497	spin_lock(&kvm->mmu_lock);
				498	/*
				499	* Even though we do not flush TLB, this will still adversely
				500	* affect performance on pre-Haswell Intel EPT, where there is
				501	* no EPT Access Bit to clear so that we have to tear down EPT
				502	* tables instead. If we find this unacceptable, we can always
				503	* add a parameter to kvm_age_hva so that it effectively doesn't
				504	* do anything on clear_young.
				505	*
				506	* Also note that currently we never issue secondary TLB flushes
				507	* from clear_young, leaving this job up to the regular system
				508	* cadence. If we find this inaccurate, we might come up with a
				509	* more sophisticated heuristic later.
				510	*/
				511	young = kvm_age_hva(kvm, start, end);
				512	spin_unlock(&kvm->mmu_lock);
				513	srcu_read_unlock(&kvm->srcu, idx);
				514
				515	return young;
				516	}
				517
				518	static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				519	struct mm_struct *mm,
				520	unsigned long address)
				521	{
				522	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				523	int young, idx;
				524
				525	idx = srcu_read_lock(&kvm->srcu);
				526	spin_lock(&kvm->mmu_lock);
				527	young = kvm_test_age_hva(kvm, address);
				528	spin_unlock(&kvm->mmu_lock);
				529	srcu_read_unlock(&kvm->srcu, idx);
				530
				531	return young;
				532	}
				533
				534	static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				535	struct mm_struct *mm)
				536	{
				537	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				538	int idx;
				539
				540	idx = srcu_read_lock(&kvm->srcu);
				541	kvm_arch_flush_shadow_all(kvm);
				542	srcu_read_unlock(&kvm->srcu, idx);
				543	}
				544
				545	static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
				546	.invalidate_range = kvm_mmu_notifier_invalidate_range,
				547	.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
				548	.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
				549	.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
				550	.clear_young = kvm_mmu_notifier_clear_young,
				551	.test_young = kvm_mmu_notifier_test_young,
				552	.change_pte = kvm_mmu_notifier_change_pte,
				553	.release = kvm_mmu_notifier_release,
				554	};
				555
				556	static int kvm_init_mmu_notifier(struct kvm *kvm)
				557	{
				558	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
				559	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
				560	}
				561
				562	#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
				563
				564	static int kvm_init_mmu_notifier(struct kvm *kvm)
				565	{
				566	return 0;
				567	}
				568
				569	#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
				570
				571	static struct kvm_memslots *kvm_alloc_memslots(void)
				572	{
				573	int i;
				574	struct kvm_memslots *slots;
				575
				576	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
				577	if (!slots)
				578	return NULL;
				579
				580	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
				581	slots->id_to_index[i] = slots->memslots[i].id = i;
				582
				583	return slots;
				584	}
				585
				586	static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
				587	{
				588	if (!memslot->dirty_bitmap)
				589	return;
				590
				591	kvfree(memslot->dirty_bitmap);
				592	memslot->dirty_bitmap = NULL;
				593	}
				594
				595	/*
				596	* Free any memory in @free but not in @dont.
				597	*/
				598	static void kvm_free_memslot(struct kvm kvm, struct kvm_memory_slot free,
				599	struct kvm_memory_slot *dont)
				600	{
				601	if (!dont \|\| free->dirty_bitmap != dont->dirty_bitmap)
				602	kvm_destroy_dirty_bitmap(free);
				603
				604	kvm_arch_free_memslot(kvm, free, dont);
				605
				606	free->npages = 0;
				607	}
				608
				609	static void kvm_free_memslots(struct kvm kvm, struct kvm_memslots slots)
				610	{
				611	struct kvm_memory_slot *memslot;
				612
				613	if (!slots)
				614	return;
				615
				616	kvm_for_each_memslot(memslot, slots)
				617	kvm_free_memslot(kvm, memslot, NULL);
				618
				619	kvfree(slots);
				620	}
				621
				622	static void kvm_destroy_vm_debugfs(struct kvm *kvm)
				623	{
				624	int i;
				625
				626	if (!kvm->debugfs_dentry)
				627	return;
				628
				629	debugfs_remove_recursive(kvm->debugfs_dentry);
				630
				631	if (kvm->debugfs_stat_data) {
				632	for (i = 0; i < kvm_debugfs_num_entries; i++)
				633	kfree(kvm->debugfs_stat_data[i]);
				634	kfree(kvm->debugfs_stat_data);
				635	}
				636	}
				637
				638	static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
				639	{
				640	static DEFINE_MUTEX(kvm_debugfs_lock);
				641	struct dentry *dent;
				642	char dir_name[ITOA_MAX_LEN * 2];
				643	struct kvm_stat_data *stat_data;
				644	struct kvm_stats_debugfs_item *p;
				645
				646	if (!debugfs_initialized())
				647	return 0;
				648
				649	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
				650	mutex_lock(&kvm_debugfs_lock);
				651	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
				652	if (dent) {
				653	pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
				654	dput(dent);
				655	mutex_unlock(&kvm_debugfs_lock);
				656	return 0;
				657	}
				658	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
				659	mutex_unlock(&kvm_debugfs_lock);
				660	if (IS_ERR(dent))
				661	return 0;
				662
				663	kvm->debugfs_dentry = dent;
				664	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
				665	sizeof(*kvm->debugfs_stat_data),
				666	GFP_KERNEL_ACCOUNT);
				667	if (!kvm->debugfs_stat_data)
				668	return -ENOMEM;
				669
				670	for (p = debugfs_entries; p->name; p++) {
				671	stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
				672	if (!stat_data)
				673	return -ENOMEM;
				674
				675	stat_data->kvm = kvm;
				676	stat_data->offset = p->offset;
				677	stat_data->mode = p->mode ? p->mode : 0644;
				678	kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
				679	debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
				680	stat_data, stat_fops_per_vm[p->kind]);
				681	}
				682	return 0;
				683	}
				684
				685	/*
				686	* Called after the VM is otherwise initialized, but just before adding it to
				687	* the vm_list.
				688	*/
				689	int __weak kvm_arch_post_init_vm(struct kvm *kvm)
				690	{
				691	return 0;
				692	}
				693
				694	/*
				695	* Called just after removing the VM from the vm_list, but before doing any
				696	* other destruction.
				697	*/
				698	void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
				699	{
				700	}
				701
				702	static struct kvm *kvm_create_vm(unsigned long type)
				703	{
				704	struct kvm *kvm = kvm_arch_alloc_vm();
				705	int r = -ENOMEM;
				706	int i;
				707
				708	if (!kvm)
				709	return ERR_PTR(-ENOMEM);
				710
				711	spin_lock_init(&kvm->mmu_lock);
				712	mmgrab(current->mm);
				713	kvm->mm = current->mm;
				714	kvm_eventfd_init(kvm);
				715	mutex_init(&kvm->lock);
				716	mutex_init(&kvm->irq_lock);
				717	mutex_init(&kvm->slots_lock);
				718	INIT_LIST_HEAD(&kvm->devices);
				719
				720	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
				721
				722	if (init_srcu_struct(&kvm->srcu))
				723	goto out_err_no_srcu;
				724	if (init_srcu_struct(&kvm->irq_srcu))
				725	goto out_err_no_irq_srcu;
				726
				727	refcount_set(&kvm->users_count, 1);
				728	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				729	struct kvm_memslots *slots = kvm_alloc_memslots();
				730
				731	if (!slots)
				732	goto out_err_no_arch_destroy_vm;
				733	/* Generations must be different for each address space. */
				734	slots->generation = i;
				735	rcu_assign_pointer(kvm->memslots[i], slots);
				736	}
				737
				738	for (i = 0; i < KVM_NR_BUSES; i++) {
				739	rcu_assign_pointer(kvm->buses[i],
				740	kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
				741	if (!kvm->buses[i])
				742	goto out_err_no_arch_destroy_vm;
				743	}
				744
				745	r = kvm_arch_init_vm(kvm, type);
				746	if (r)
				747	goto out_err_no_arch_destroy_vm;
				748
				749	r = hardware_enable_all();
				750	if (r)
				751	goto out_err_no_disable;
				752
				753	#ifdef CONFIG_HAVE_KVM_IRQFD
				754	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
				755	#endif
				756
				757	r = kvm_init_mmu_notifier(kvm);
				758	if (r)
				759	goto out_err_no_mmu_notifier;
				760
				761	r = kvm_arch_post_init_vm(kvm);
				762	if (r)
				763	goto out_err;
				764
				765	mutex_lock(&kvm_lock);
				766	list_add(&kvm->vm_list, &vm_list);
				767	mutex_unlock(&kvm_lock);
				768
				769	preempt_notifier_inc();
				770
				771	/*
				772	* When the fd passed to this ioctl() is opened it pins the module,
				773	* but try_module_get() also prevents getting a reference if the module
				774	* is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
				775	*/
				776	if (!try_module_get(kvm_chardev_ops.owner)) {
				777	r = -ENODEV;
				778	goto out_err;
				779	}
				780
				781	return kvm;
				782
				783	out_err:
				784	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				785	if (kvm->mmu_notifier.ops)
				786	mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
				787	#endif
				788	out_err_no_mmu_notifier:
				789	hardware_disable_all();
				790	out_err_no_disable:
				791	kvm_arch_destroy_vm(kvm);
				792	out_err_no_arch_destroy_vm:
				793	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
				794	for (i = 0; i < KVM_NR_BUSES; i++)
				795	kfree(kvm_get_bus(kvm, i));
				796	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				797	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				798	cleanup_srcu_struct(&kvm->irq_srcu);
				799	out_err_no_irq_srcu:
				800	cleanup_srcu_struct(&kvm->srcu);
				801	out_err_no_srcu:
				802	kvm_arch_free_vm(kvm);
				803	mmdrop(current->mm);
				804	return ERR_PTR(r);
				805	}
				806
				807	static void kvm_destroy_devices(struct kvm *kvm)
				808	{
				809	struct kvm_device dev, tmp;
				810
				811	/*
				812	* We do not need to take the kvm->lock here, because nobody else
				813	* has a reference to the struct kvm at this point and therefore
				814	* cannot access the devices list anyhow.
				815	*/
				816	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
				817	list_del(&dev->vm_node);
				818	dev->ops->destroy(dev);
				819	}
				820	}
				821
				822	static void kvm_destroy_vm(struct kvm *kvm)
				823	{
				824	int i;
				825	struct mm_struct *mm = kvm->mm;
				826
				827	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
				828	kvm_destroy_vm_debugfs(kvm);
				829	kvm_arch_sync_events(kvm);
				830	mutex_lock(&kvm_lock);
				831	list_del(&kvm->vm_list);
				832	mutex_unlock(&kvm_lock);
				833	kvm_arch_pre_destroy_vm(kvm);
				834
				835	kvm_free_irq_routing(kvm);
				836	for (i = 0; i < KVM_NR_BUSES; i++) {
				837	struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
				838
				839	if (bus)
				840	kvm_io_bus_destroy(bus);
				841	kvm->buses[i] = NULL;
				842	}
				843	kvm_coalesced_mmio_free(kvm);
				844	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				845	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
				846	#else
				847	kvm_arch_flush_shadow_all(kvm);
				848	#endif
				849	kvm_arch_destroy_vm(kvm);
				850	kvm_destroy_devices(kvm);
				851	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				852	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				853	cleanup_srcu_struct(&kvm->irq_srcu);
				854	cleanup_srcu_struct(&kvm->srcu);
				855	kvm_arch_free_vm(kvm);
				856	preempt_notifier_dec();
				857	hardware_disable_all();
				858	mmdrop(mm);
				859	module_put(kvm_chardev_ops.owner);
				860	}
				861
				862	void kvm_get_kvm(struct kvm *kvm)
				863	{
				864	refcount_inc(&kvm->users_count);
				865	}
				866	EXPORT_SYMBOL_GPL(kvm_get_kvm);
				867
				868	void kvm_put_kvm(struct kvm *kvm)
				869	{
				870	if (refcount_dec_and_test(&kvm->users_count))
				871	kvm_destroy_vm(kvm);
				872	}
				873	EXPORT_SYMBOL_GPL(kvm_put_kvm);
				874
				875
				876	static int kvm_vm_release(struct inode inode, struct file filp)
				877	{
				878	struct kvm *kvm = filp->private_data;
				879
				880	kvm_irqfd_release(kvm);
				881
				882	kvm_put_kvm(kvm);
				883	return 0;
				884	}
				885
				886	/*
				887	* Allocation size is twice as large as the actual dirty bitmap size.
				888	* See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
				889	*/
				890	static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
				891	{
				892	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
				893
				894	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
				895	if (!memslot->dirty_bitmap)
				896	return -ENOMEM;
				897
				898	return 0;
				899	}
				900
				901	/*
				902	* Insert memslot and re-sort memslots based on their GFN,
				903	* so binary search could be used to lookup GFN.
				904	* Sorting algorithm takes advantage of having initially
				905	* sorted array and known changed memslot position.
				906	*/
				907	static void update_memslots(struct kvm_memslots *slots,
				908	struct kvm_memory_slot *new,
				909	enum kvm_mr_change change)
				910	{
				911	int id = new->id;
				912	int i = slots->id_to_index[id];
				913	struct kvm_memory_slot *mslots = slots->memslots;
				914
				915	WARN_ON(mslots[i].id != id);
				916	switch (change) {
				917	case KVM_MR_CREATE:
				918	slots->used_slots++;
				919	WARN_ON(mslots[i].npages \|\| !new->npages);
				920	break;
				921	case KVM_MR_DELETE:
				922	slots->used_slots--;
				923	WARN_ON(new->npages \|\| !mslots[i].npages);
				924	break;
				925	default:
				926	break;
				927	}
				928
				929	while (i < KVM_MEM_SLOTS_NUM - 1 &&
				930	new->base_gfn <= mslots[i + 1].base_gfn) {
				931	if (!mslots[i + 1].npages)
				932	break;
				933	mslots[i] = mslots[i + 1];
				934	slots->id_to_index[mslots[i].id] = i;
				935	i++;
				936	}
				937
				938	/*
				939	* The ">=" is needed when creating a slot with base_gfn == 0,
				940	* so that it moves before all those with base_gfn == npages == 0.
				941	*
				942	* On the other hand, if new->npages is zero, the above loop has
				943	* already left i pointing to the beginning of the empty part of
				944	* mslots, and the ">=" would move the hole backwards in this
				945	* case---which is wrong. So skip the loop when deleting a slot.
				946	*/
				947	if (new->npages) {
				948	while (i > 0 &&
				949	new->base_gfn >= mslots[i - 1].base_gfn) {
				950	mslots[i] = mslots[i - 1];
				951	slots->id_to_index[mslots[i].id] = i;
				952	i--;
				953	}
				954	} else
				955	WARN_ON_ONCE(i != slots->used_slots);
				956
				957	mslots[i] = *new;
				958	slots->id_to_index[mslots[i].id] = i;
				959	}
				960
				961	static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
				962	{
				963	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
				964
				965	#ifdef __KVM_HAVE_READONLY_MEM
				966	valid_flags \|= KVM_MEM_READONLY;
				967	#endif
				968
				969	if (mem->flags & ~valid_flags)
				970	return -EINVAL;
				971
				972	return 0;
				973	}
				974
				975	static struct kvm_memslots install_new_memslots(struct kvm kvm,
				976	int as_id, struct kvm_memslots *slots)
				977	{
				978	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
				979	u64 gen = old_memslots->generation;
				980
				981	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
				982	slots->generation = gen \| KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
				983
				984	rcu_assign_pointer(kvm->memslots[as_id], slots);
				985	synchronize_srcu_expedited(&kvm->srcu);
				986
				987	/*
				988	* Increment the new memslot generation a second time, dropping the
				989	* update in-progress flag and incrementing then generation based on
				990	* the number of address spaces. This provides a unique and easily
				991	* identifiable generation number while the memslots are in flux.
				992	*/
				993	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
				994
				995	/*
				996	* Generations must be unique even across address spaces. We do not need
				997	* a global counter for that, instead the generation space is evenly split
				998	* across address spaces. For example, with two address spaces, address
				999	* space 0 will use generations 0, 2, 4, ... while address space 1 will
				1000	* use generations 1, 3, 5, ...
				1001	*/
				1002	gen += KVM_ADDRESS_SPACE_NUM;
				1003
				1004	kvm_arch_memslots_updated(kvm, gen);
				1005
				1006	slots->generation = gen;
				1007
				1008	return old_memslots;
				1009	}
				1010
				1011	/*
				1012	* Allocate some memory and give it an address in the guest physical address
				1013	* space.
				1014	*
				1015	* Discontiguous memory is allowed, mostly for framebuffers.
				1016	*
				1017	* Must be called holding kvm->slots_lock for write.
				1018	*/
				1019	int __kvm_set_memory_region(struct kvm *kvm,
				1020	const struct kvm_userspace_memory_region *mem)
				1021	{
				1022	int r;
				1023	gfn_t base_gfn;
				1024	unsigned long npages;
				1025	struct kvm_memory_slot *slot;
				1026	struct kvm_memory_slot old, new;
				1027	struct kvm_memslots slots = NULL, old_memslots;
				1028	int as_id, id;
				1029	enum kvm_mr_change change;
				1030
				1031	r = check_memory_region_flags(mem);
				1032	if (r)
				1033	goto out;
				1034
				1035	r = -EINVAL;
				1036	as_id = mem->slot >> 16;
				1037	id = (u16)mem->slot;
				1038
				1039	/* General sanity checks */
				1040	if (mem->memory_size & (PAGE_SIZE - 1))
				1041	goto out;
				1042	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
				1043	goto out;
				1044	/* We can read the guest memory with __xxx_user() later on. */
				1045	if ((id < KVM_USER_MEM_SLOTS) &&
				1046	((mem->userspace_addr & (PAGE_SIZE - 1)) \|\|
				1047	(mem->userspace_addr != untagged_addr(mem->userspace_addr)) \|\|
				1048	!access_ok((void __user *)(unsigned long)mem->userspace_addr,
				1049	mem->memory_size)))
				1050	goto out;
				1051	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_MEM_SLOTS_NUM)
				1052	goto out;
				1053	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
				1054	goto out;
				1055
				1056	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
				1057	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
				1058	npages = mem->memory_size >> PAGE_SHIFT;
				1059
				1060	if (npages > KVM_MEM_MAX_NR_PAGES)
				1061	goto out;
				1062
				1063	new = old = *slot;
				1064
				1065	new.id = id;
				1066	new.base_gfn = base_gfn;
				1067	new.npages = npages;
				1068	new.flags = mem->flags;
				1069
				1070	if (npages) {
				1071	if (!old.npages)
				1072	change = KVM_MR_CREATE;
				1073	else { /* Modify an existing slot. */
				1074	if ((mem->userspace_addr != old.userspace_addr) \|\|
				1075	(npages != old.npages) \|\|
				1076	((new.flags ^ old.flags) & KVM_MEM_READONLY))
				1077	goto out;
				1078
				1079	if (base_gfn != old.base_gfn)
				1080	change = KVM_MR_MOVE;
				1081	else if (new.flags != old.flags)
				1082	change = KVM_MR_FLAGS_ONLY;
				1083	else { /* Nothing to change. */
				1084	r = 0;
				1085	goto out;
				1086	}
				1087	}
				1088	} else {
				1089	if (!old.npages)
				1090	goto out;
				1091
				1092	change = KVM_MR_DELETE;
				1093	new.base_gfn = 0;
				1094	new.flags = 0;
				1095	}
				1096
				1097	if ((change == KVM_MR_CREATE) \|\| (change == KVM_MR_MOVE)) {
				1098	/* Check for overlaps */
				1099	r = -EEXIST;
				1100	kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
				1101	if (slot->id == id)
				1102	continue;
				1103	if (!((base_gfn + npages <= slot->base_gfn) \|\|
				1104	(base_gfn >= slot->base_gfn + slot->npages)))
				1105	goto out;
				1106	}
				1107	}
				1108
				1109	/* Free page dirty bitmap if unneeded */
				1110	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
				1111	new.dirty_bitmap = NULL;
				1112
				1113	r = -ENOMEM;
				1114	if (change == KVM_MR_CREATE) {
				1115	new.userspace_addr = mem->userspace_addr;
				1116
				1117	if (kvm_arch_create_memslot(kvm, &new, npages))
				1118	goto out_free;
				1119	}
				1120
				1121	/* Allocate page dirty bitmap if needed */
				1122	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
				1123	if (kvm_create_dirty_bitmap(&new) < 0)
				1124	goto out_free;
				1125	}
				1126
				1127	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
				1128	if (!slots)
				1129	goto out_free;
				1130	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
				1131
				1132	if ((change == KVM_MR_DELETE) \|\| (change == KVM_MR_MOVE)) {
				1133	slot = id_to_memslot(slots, id);
				1134	slot->flags \|= KVM_MEMSLOT_INVALID;
				1135
				1136	old_memslots = install_new_memslots(kvm, as_id, slots);
				1137
				1138	/* From this point no new shadow pages pointing to a deleted,
				1139	* or moved, memslot will be created.
				1140	*
				1141	* validation of sp->gfn happens in:
				1142	* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
				1143	* - kvm_is_visible_gfn (mmu_check_roots)
				1144	*/
				1145	kvm_arch_flush_shadow_memslot(kvm, slot);
				1146
				1147	/*
				1148	* We can re-use the old_memslots from above, the only difference
				1149	* from the currently installed memslots is the invalid flag. This
				1150	* will get overwritten by update_memslots anyway.
				1151	*/
				1152	slots = old_memslots;
				1153	}
				1154
				1155	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
				1156	if (r)
				1157	goto out_slots;
				1158
				1159	/* actual memory is freed via old in kvm_free_memslot below */
				1160	if (change == KVM_MR_DELETE) {
				1161	new.dirty_bitmap = NULL;
				1162	memset(&new.arch, 0, sizeof(new.arch));
				1163	}
				1164
				1165	update_memslots(slots, &new, change);
				1166	old_memslots = install_new_memslots(kvm, as_id, slots);
				1167
				1168	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
				1169
				1170	kvm_free_memslot(kvm, &old, &new);
				1171	kvfree(old_memslots);
				1172	return 0;
				1173
				1174	out_slots:
				1175	kvfree(slots);
				1176	out_free:
				1177	kvm_free_memslot(kvm, &new, &old);
				1178	out:
				1179	return r;
				1180	}
				1181	EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
				1182
				1183	int kvm_set_memory_region(struct kvm *kvm,
				1184	const struct kvm_userspace_memory_region *mem)
				1185	{
				1186	int r;
				1187
				1188	mutex_lock(&kvm->slots_lock);
				1189	r = __kvm_set_memory_region(kvm, mem);
				1190	mutex_unlock(&kvm->slots_lock);
				1191	return r;
				1192	}
				1193	EXPORT_SYMBOL_GPL(kvm_set_memory_region);
				1194
				1195	static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				1196	struct kvm_userspace_memory_region *mem)
				1197	{
				1198	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
				1199	return -EINVAL;
				1200
				1201	return kvm_set_memory_region(kvm, mem);
				1202	}
				1203
				1204	int kvm_get_dirty_log(struct kvm *kvm,
				1205	struct kvm_dirty_log log, int is_dirty)
				1206	{
				1207	struct kvm_memslots *slots;
				1208	struct kvm_memory_slot *memslot;
				1209	int i, as_id, id;
				1210	unsigned long n;
				1211	unsigned long any = 0;
				1212
				1213	as_id = log->slot >> 16;
				1214	id = (u16)log->slot;
				1215	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
				1216	return -EINVAL;
				1217
				1218	slots = __kvm_memslots(kvm, as_id);
				1219	memslot = id_to_memslot(slots, id);
				1220	if (!memslot->dirty_bitmap)
				1221	return -ENOENT;
				1222
				1223	n = kvm_dirty_bitmap_bytes(memslot);
				1224
				1225	for (i = 0; !any && i < n/sizeof(long); ++i)
				1226	any = memslot->dirty_bitmap[i];
				1227
				1228	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
				1229	return -EFAULT;
				1230
				1231	if (any)
				1232	*is_dirty = 1;
				1233	return 0;
				1234	}
				1235	EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
				1236
				1237	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				1238	/**
				1239	* kvm_get_dirty_log_protect - get a snapshot of dirty pages
				1240	* and reenable dirty page tracking for the corresponding pages.
				1241	* @kvm: pointer to kvm instance
				1242	* @log: slot id and address to which we copy the log
				1243	* @flush: true if TLB flush is needed by caller
				1244	*
				1245	* We need to keep it in mind that VCPU threads can write to the bitmap
				1246	* concurrently. So, to avoid losing track of dirty pages we keep the
				1247	* following order:
				1248	*
				1249	* 1. Take a snapshot of the bit and clear it if needed.
				1250	* 2. Write protect the corresponding page.
				1251	* 3. Copy the snapshot to the userspace.
				1252	* 4. Upon return caller flushes TLB's if needed.
				1253	*
				1254	* Between 2 and 4, the guest may write to the page using the remaining TLB
				1255	* entry. This is not a problem because the page is reported dirty using
				1256	* the snapshot taken before and step 4 ensures that writes done after
				1257	* exiting to userspace will be logged for the next call.
				1258	*
				1259	*/
				1260	int kvm_get_dirty_log_protect(struct kvm *kvm,
				1261	struct kvm_dirty_log log, bool flush)
				1262	{
				1263	struct kvm_memslots *slots;
				1264	struct kvm_memory_slot *memslot;
				1265	int i, as_id, id;
				1266	unsigned long n;
				1267	unsigned long *dirty_bitmap;
				1268	unsigned long *dirty_bitmap_buffer;
				1269
				1270	as_id = log->slot >> 16;
				1271	id = (u16)log->slot;
				1272	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
				1273	return -EINVAL;
				1274
				1275	slots = __kvm_memslots(kvm, as_id);
				1276	memslot = id_to_memslot(slots, id);
				1277
				1278	dirty_bitmap = memslot->dirty_bitmap;
				1279	if (!dirty_bitmap)
				1280	return -ENOENT;
				1281
				1282	n = kvm_dirty_bitmap_bytes(memslot);
				1283	*flush = false;
				1284	if (kvm->manual_dirty_log_protect) {
				1285	/*
				1286	* Unlike kvm_get_dirty_log, we always return false in *flush,
				1287	* because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
				1288	* is some code duplication between this function and
				1289	* kvm_get_dirty_log, but hopefully all architecture
				1290	* transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
				1291	* can be eliminated.
				1292	*/
				1293	dirty_bitmap_buffer = dirty_bitmap;
				1294	} else {
				1295	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
				1296	memset(dirty_bitmap_buffer, 0, n);
				1297
				1298	spin_lock(&kvm->mmu_lock);
				1299	for (i = 0; i < n / sizeof(long); i++) {
				1300	unsigned long mask;
				1301	gfn_t offset;
				1302
				1303	if (!dirty_bitmap[i])
				1304	continue;
				1305
				1306	*flush = true;
				1307	mask = xchg(&dirty_bitmap[i], 0);
				1308	dirty_bitmap_buffer[i] = mask;
				1309
				1310	offset = i * BITS_PER_LONG;
				1311	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
				1312	offset, mask);
				1313	}
				1314	spin_unlock(&kvm->mmu_lock);
				1315	}
				1316
				1317	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
				1318	return -EFAULT;
				1319	return 0;
				1320	}
				1321	EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
				1322
				1323	/**
				1324	* kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
				1325	* and reenable dirty page tracking for the corresponding pages.
				1326	* @kvm: pointer to kvm instance
				1327	* @log: slot id and address from which to fetch the bitmap of dirty pages
				1328	* @flush: true if TLB flush is needed by caller
				1329	*/
				1330	int kvm_clear_dirty_log_protect(struct kvm *kvm,
				1331	struct kvm_clear_dirty_log log, bool flush)
				1332	{
				1333	struct kvm_memslots *slots;
				1334	struct kvm_memory_slot *memslot;
				1335	int as_id, id;
				1336	gfn_t offset;
				1337	unsigned long i, n;
				1338	unsigned long *dirty_bitmap;
				1339	unsigned long *dirty_bitmap_buffer;
				1340
				1341	as_id = log->slot >> 16;
				1342	id = (u16)log->slot;
				1343	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
				1344	return -EINVAL;
				1345
				1346	if (log->first_page & 63)
				1347	return -EINVAL;
				1348
				1349	slots = __kvm_memslots(kvm, as_id);
				1350	memslot = id_to_memslot(slots, id);
				1351
				1352	dirty_bitmap = memslot->dirty_bitmap;
				1353	if (!dirty_bitmap)
				1354	return -ENOENT;
				1355
				1356	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
				1357
				1358	if (log->first_page > memslot->npages \|\|
				1359	log->num_pages > memslot->npages - log->first_page \|\|
				1360	(log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
				1361	return -EINVAL;
				1362
				1363	*flush = false;
				1364	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
				1365	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
				1366	return -EFAULT;
				1367
				1368	spin_lock(&kvm->mmu_lock);
				1369	for (offset = log->first_page, i = offset / BITS_PER_LONG,
				1370	n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
				1371	i++, offset += BITS_PER_LONG) {
				1372	unsigned long mask = *dirty_bitmap_buffer++;
				1373	atomic_long_t p = (atomic_long_t ) &dirty_bitmap[i];
				1374	if (!mask)
				1375	continue;
				1376
				1377	mask &= atomic_long_fetch_andnot(mask, p);
				1378
				1379	/*
				1380	* mask contains the bits that really have been cleared. This
				1381	* never includes any bits beyond the length of the memslot (if
				1382	* the length is not aligned to 64 pages), therefore it is not
				1383	* a problem if userspace sets them in log->dirty_bitmap.
				1384	*/
				1385	if (mask) {
				1386	*flush = true;
				1387	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
				1388	offset, mask);
				1389	}
				1390	}
				1391	spin_unlock(&kvm->mmu_lock);
				1392
				1393	return 0;
				1394	}
				1395	EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
				1396	#endif
				1397
				1398	bool kvm_largepages_enabled(void)
				1399	{
				1400	return largepages_enabled;
				1401	}
				1402
				1403	void kvm_disable_largepages(void)
				1404	{
				1405	largepages_enabled = false;
				1406	}
				1407	EXPORT_SYMBOL_GPL(kvm_disable_largepages);
				1408
				1409	struct kvm_memory_slot gfn_to_memslot(struct kvm kvm, gfn_t gfn)
				1410	{
				1411	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
				1412	}
				1413	EXPORT_SYMBOL_GPL(gfn_to_memslot);
				1414
				1415	struct kvm_memory_slot kvm_vcpu_gfn_to_memslot(struct kvm_vcpu vcpu, gfn_t gfn)
				1416	{
				1417	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
				1418	}
				1419
				1420	bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
				1421	{
				1422	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
				1423
				1424	if (!memslot \|\| memslot->id >= KVM_USER_MEM_SLOTS \|\|
				1425	memslot->flags & KVM_MEMSLOT_INVALID)
				1426	return false;
				1427
				1428	return true;
				1429	}
				1430	EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
				1431
				1432	unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
				1433	{
				1434	struct vm_area_struct *vma;
				1435	unsigned long addr, size;
				1436
				1437	size = PAGE_SIZE;
				1438
				1439	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
				1440	if (kvm_is_error_hva(addr))
				1441	return PAGE_SIZE;
				1442
				1443	down_read(&current->mm->mmap_sem);
				1444	vma = find_vma(current->mm, addr);
				1445	if (!vma)
				1446	goto out;
				1447
				1448	size = vma_kernel_pagesize(vma);
				1449
				1450	out:
				1451	up_read(&current->mm->mmap_sem);
				1452
				1453	return size;
				1454	}
				1455
				1456	static bool memslot_is_readonly(struct kvm_memory_slot *slot)
				1457	{
				1458	return slot->flags & KVM_MEM_READONLY;
				1459	}
				1460
				1461	static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1462	gfn_t *nr_pages, bool write)
				1463	{
				1464	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1465	return KVM_HVA_ERR_BAD;
				1466
				1467	if (memslot_is_readonly(slot) && write)
				1468	return KVM_HVA_ERR_RO_BAD;
				1469
				1470	if (nr_pages)
				1471	*nr_pages = slot->npages - (gfn - slot->base_gfn);
				1472
				1473	return __gfn_to_hva_memslot(slot, gfn);
				1474	}
				1475
				1476	static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1477	gfn_t *nr_pages)
				1478	{
				1479	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
				1480	}
				1481
				1482	unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				1483	gfn_t gfn)
				1484	{
				1485	return gfn_to_hva_many(slot, gfn, NULL);
				1486	}
				1487	EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
				1488
				1489	unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
				1490	{
				1491	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
				1492	}
				1493	EXPORT_SYMBOL_GPL(gfn_to_hva);
				1494
				1495	unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
				1496	{
				1497	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
				1498	}
				1499	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
				1500
				1501	/*
				1502	* Return the hva of a @gfn and the R/W attribute if possible.
				1503	*
				1504	* @slot: the kvm_memory_slot which contains @gfn
				1505	* @gfn: the gfn to be translated
				1506	* @writable: used to return the read/write attribute of the @slot if the hva
				1507	* is valid and @writable is not NULL
				1508	*/
				1509	unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				1510	gfn_t gfn, bool *writable)
				1511	{
				1512	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
				1513
				1514	if (!kvm_is_error_hva(hva) && writable)
				1515	*writable = !memslot_is_readonly(slot);
				1516
				1517	return hva;
				1518	}
				1519
				1520	unsigned long gfn_to_hva_prot(struct kvm kvm, gfn_t gfn, bool writable)
				1521	{
				1522	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1523
				1524	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1525	}
				1526
				1527	unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu vcpu, gfn_t gfn, bool writable)
				1528	{
				1529	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1530
				1531	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1532	}
				1533
				1534	static inline int check_user_page_hwpoison(unsigned long addr)
				1535	{
				1536	int rc, flags = FOLL_HWPOISON \| FOLL_WRITE;
				1537
				1538	rc = get_user_pages(addr, 1, flags, NULL, NULL);
				1539	return rc == -EHWPOISON;
				1540	}
				1541
				1542	/*
				1543	* The fast path to get the writable pfn which will be stored in @pfn,
				1544	* true indicates success, otherwise false is returned. It's also the
				1545	* only part that runs if we can are in atomic context.
				1546	*/
				1547	static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
				1548	bool writable, kvm_pfn_t pfn)
				1549	{
				1550	struct page *page[1];
				1551	int npages;
				1552
				1553	/*
				1554	* Fast pin a writable pfn only if it is a write fault request
				1555	* or the caller allows to map a writable pfn for a read fault
				1556	* request.
				1557	*/
				1558	if (!(write_fault \|\| writable))
				1559	return false;
				1560
				1561	npages = __get_user_pages_fast(addr, 1, 1, page);
				1562	if (npages == 1) {
				1563	*pfn = page_to_pfn(page[0]);
				1564
				1565	if (writable)
				1566	*writable = true;
				1567	return true;
				1568	}
				1569
				1570	return false;
				1571	}
				1572
				1573	/*
				1574	* The slow path to get the pfn of the specified host virtual address,
				1575	* 1 indicates success, -errno is returned if error is detected.
				1576	*/
				1577	static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
				1578	bool writable, kvm_pfn_t pfn)
				1579	{
				1580	unsigned int flags = FOLL_HWPOISON;
				1581	struct page *page;
				1582	int npages = 0;
				1583
				1584	might_sleep();
				1585
				1586	if (writable)
				1587	*writable = write_fault;
				1588
				1589	if (write_fault)
				1590	flags \|= FOLL_WRITE;
				1591	if (async)
				1592	flags \|= FOLL_NOWAIT;
				1593
				1594	npages = get_user_pages_unlocked(addr, 1, &page, flags);
				1595	if (npages != 1)
				1596	return npages;
				1597
				1598	/* map read fault as writable if possible */
				1599	if (unlikely(!write_fault) && writable) {
				1600	struct page *wpage;
				1601
				1602	if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
				1603	*writable = true;
				1604	put_page(page);
				1605	page = wpage;
				1606	}
				1607	}
				1608	*pfn = page_to_pfn(page);
				1609	return npages;
				1610	}
				1611
				1612	static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
				1613	{
				1614	if (unlikely(!(vma->vm_flags & VM_READ)))
				1615	return false;
				1616
				1617	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
				1618	return false;
				1619
				1620	return true;
				1621	}
				1622
				1623	static int kvm_try_get_pfn(kvm_pfn_t pfn)
				1624	{
				1625	if (kvm_is_reserved_pfn(pfn))
				1626	return 1;
				1627	return get_page_unless_zero(pfn_to_page(pfn));
				1628	}
				1629
				1630	static int hva_to_pfn_remapped(struct vm_area_struct *vma,
				1631	unsigned long addr, bool *async,
				1632	bool write_fault, bool *writable,
				1633	kvm_pfn_t *p_pfn)
				1634	{
				1635	kvm_pfn_t pfn;
				1636	pte_t *ptep;
				1637	spinlock_t *ptl;
				1638	int r;
				1639
				1640	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
				1641	if (r) {
				1642	/*
				1643	* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
				1644	* not call the fault handler, so do it here.
				1645	*/
				1646	bool unlocked = false;
				1647	r = fixup_user_fault(current, current->mm, addr,
				1648	(write_fault ? FAULT_FLAG_WRITE : 0),
				1649	&unlocked);
				1650	if (unlocked)
				1651	return -EAGAIN;
				1652	if (r)
				1653	return r;
				1654
				1655	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
				1656	if (r)
				1657	return r;
				1658	}
				1659
				1660	if (write_fault && !pte_write(*ptep)) {
				1661	pfn = KVM_PFN_ERR_RO_FAULT;
				1662	goto out;
				1663	}
				1664
				1665	if (writable)
				1666	writable = pte_write(ptep);
				1667	pfn = pte_pfn(*ptep);
				1668
				1669	/*
				1670	* Get a reference here because callers of hva_to_pfn and
				1671	* gfn_to_pfn ultimately call kvm_release_pfn_clean on the
				1672	* returned pfn. This is only needed if the VMA has VM_MIXEDMAP
				1673	* set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
				1674	* simply do nothing for reserved pfns.
				1675	*
				1676	* Whoever called remap_pfn_range is also going to call e.g.
				1677	* unmap_mapping_range before the underlying pages are freed,
				1678	* causing a call to our MMU notifier.
				1679	*
				1680	* Certain IO or PFNMAP mappings can be backed with valid
				1681	* struct pages, but be allocated without refcounting e.g.,
				1682	* tail pages of non-compound higher order allocations, which
				1683	* would then underflow the refcount when the caller does the
				1684	* required put_page. Don't allow those pages here.
				1685	*/
				1686	if (!kvm_try_get_pfn(pfn))
				1687	r = -EFAULT;
				1688
				1689	out:
				1690	pte_unmap_unlock(ptep, ptl);
				1691	*p_pfn = pfn;
				1692
				1693	return r;
				1694	}
				1695
				1696	/*
				1697	* Pin guest page in memory and return its pfn.
				1698	* @addr: host virtual address which maps memory to the guest
				1699	* @atomic: whether this function can sleep
				1700	* @async: whether this function need to wait IO complete if the
				1701	* host page is not in the memory
				1702	* @write_fault: whether we should get a writable host page
				1703	* @writable: whether it allows to map a writable host page for !@write_fault
				1704	*
				1705	* The function will map a writable host page for these two cases:
				1706	* 1): @write_fault = true
				1707	* 2): @write_fault = false && @writable, @writable will tell the caller
				1708	* whether the mapping is writable.
				1709	*/
				1710	static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
				1711	bool write_fault, bool *writable)
				1712	{
				1713	struct vm_area_struct *vma;
				1714	kvm_pfn_t pfn = 0;
				1715	int npages, r;
				1716
				1717	/* we can do it either atomically or asynchronously, not both */
				1718	BUG_ON(atomic && async);
				1719
				1720	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
				1721	return pfn;
				1722
				1723	if (atomic)
				1724	return KVM_PFN_ERR_FAULT;
				1725
				1726	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
				1727	if (npages == 1)
				1728	return pfn;
				1729
				1730	down_read(&current->mm->mmap_sem);
				1731	if (npages == -EHWPOISON \|\|
				1732	(!async && check_user_page_hwpoison(addr))) {
				1733	pfn = KVM_PFN_ERR_HWPOISON;
				1734	goto exit;
				1735	}
				1736
				1737	retry:
				1738	vma = find_vma_intersection(current->mm, addr, addr + 1);
				1739
				1740	if (vma == NULL)
				1741	pfn = KVM_PFN_ERR_FAULT;
				1742	else if (vma->vm_flags & (VM_IO \| VM_PFNMAP)) {
				1743	r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
				1744	if (r == -EAGAIN)
				1745	goto retry;
				1746	if (r < 0)
				1747	pfn = KVM_PFN_ERR_FAULT;
				1748	} else {
				1749	if (async && vma_is_valid(vma, write_fault))
				1750	*async = true;
				1751	pfn = KVM_PFN_ERR_FAULT;
				1752	}
				1753	exit:
				1754	up_read(&current->mm->mmap_sem);
				1755	return pfn;
				1756	}
				1757
				1758	kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
				1759	bool atomic, bool *async, bool write_fault,
				1760	bool *writable)
				1761	{
				1762	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
				1763
				1764	if (addr == KVM_HVA_ERR_RO_BAD) {
				1765	if (writable)
				1766	*writable = false;
				1767	return KVM_PFN_ERR_RO_FAULT;
				1768	}
				1769
				1770	if (kvm_is_error_hva(addr)) {
				1771	if (writable)
				1772	*writable = false;
				1773	return KVM_PFN_NOSLOT;
				1774	}
				1775
				1776	/* Do not map writable pfn in the readonly memslot. */
				1777	if (writable && memslot_is_readonly(slot)) {
				1778	*writable = false;
				1779	writable = NULL;
				1780	}
				1781
				1782	return hva_to_pfn(addr, atomic, async, write_fault,
				1783	writable);
				1784	}
				1785	EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
				1786
				1787	kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
				1788	bool *writable)
				1789	{
				1790	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
				1791	write_fault, writable);
				1792	}
				1793	EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
				1794
				1795	kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
				1796	{
				1797	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
				1798	}
				1799	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
				1800
				1801	kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
				1802	{
				1803	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
				1804	}
				1805	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
				1806
				1807	kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
				1808	{
				1809	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
				1810	}
				1811	EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
				1812
				1813	kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
				1814	{
				1815	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1816	}
				1817	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
				1818
				1819	kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
				1820	{
				1821	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
				1822	}
				1823	EXPORT_SYMBOL_GPL(gfn_to_pfn);
				1824
				1825	kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				1826	{
				1827	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1828	}
				1829	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
				1830
				1831	int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				1832	struct page **pages, int nr_pages)
				1833	{
				1834	unsigned long addr;
				1835	gfn_t entry = 0;
				1836
				1837	addr = gfn_to_hva_many(slot, gfn, &entry);
				1838	if (kvm_is_error_hva(addr))
				1839	return -1;
				1840
				1841	if (entry < nr_pages)
				1842	return 0;
				1843
				1844	return __get_user_pages_fast(addr, nr_pages, 1, pages);
				1845	}
				1846	EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
				1847
				1848	static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
				1849	{
				1850	if (is_error_noslot_pfn(pfn))
				1851	return KVM_ERR_PTR_BAD_PAGE;
				1852
				1853	if (kvm_is_reserved_pfn(pfn)) {
				1854	WARN_ON(1);
				1855	return KVM_ERR_PTR_BAD_PAGE;
				1856	}
				1857
				1858	return pfn_to_page(pfn);
				1859	}
				1860
				1861	struct page gfn_to_page(struct kvm kvm, gfn_t gfn)
				1862	{
				1863	kvm_pfn_t pfn;
				1864
				1865	pfn = gfn_to_pfn(kvm, gfn);
				1866
				1867	return kvm_pfn_to_page(pfn);
				1868	}
				1869	EXPORT_SYMBOL_GPL(gfn_to_page);
				1870
				1871	void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
				1872	{
				1873	if (pfn == 0)
				1874	return;
				1875
				1876	if (cache)
				1877	cache->pfn = cache->gfn = 0;
				1878
				1879	if (dirty)
				1880	kvm_release_pfn_dirty(pfn);
				1881	else
				1882	kvm_release_pfn_clean(pfn);
				1883	}
				1884
				1885	static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
				1886	struct gfn_to_pfn_cache *cache, u64 gen)
				1887	{
				1888	kvm_release_pfn(cache->pfn, cache->dirty, cache);
				1889
				1890	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
				1891	cache->gfn = gfn;
				1892	cache->dirty = false;
				1893	cache->generation = gen;
				1894	}
				1895
				1896	static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
				1897	struct kvm_host_map *map,
				1898	struct gfn_to_pfn_cache *cache,
				1899	bool atomic)
				1900	{
				1901	kvm_pfn_t pfn;
				1902	void *hva = NULL;
				1903	struct page *page = KVM_UNMAPPED_PAGE;
				1904	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
				1905	u64 gen = slots->generation;
				1906
				1907	if (!map)
				1908	return -EINVAL;
				1909
				1910	if (cache) {
				1911	if (!cache->pfn \|\| cache->gfn != gfn \|\|
				1912	cache->generation != gen) {
				1913	if (atomic)
				1914	return -EAGAIN;
				1915	kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
				1916	}
				1917	pfn = cache->pfn;
				1918	} else {
				1919	if (atomic)
				1920	return -EAGAIN;
				1921	pfn = gfn_to_pfn_memslot(slot, gfn);
				1922	}
				1923	if (is_error_noslot_pfn(pfn))
				1924	return -EINVAL;
				1925
				1926	if (pfn_valid(pfn)) {
				1927	page = pfn_to_page(pfn);
				1928	if (atomic)
				1929	hva = kmap_atomic(page);
				1930	else
				1931	hva = kmap(page);
				1932	#ifdef CONFIG_HAS_IOMEM
				1933	} else if (!atomic) {
				1934	hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
				1935	} else {
				1936	return -EINVAL;
				1937	#endif
				1938	}
				1939
				1940	if (!hva)
				1941	return -EFAULT;
				1942
				1943	map->page = page;
				1944	map->hva = hva;
				1945	map->pfn = pfn;
				1946	map->gfn = gfn;
				1947
				1948	return 0;
				1949	}
				1950
				1951	int kvm_map_gfn(struct kvm_vcpu vcpu, gfn_t gfn, struct kvm_host_map map,
				1952	struct gfn_to_pfn_cache *cache, bool atomic)
				1953	{
				1954	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
				1955	cache, atomic);
				1956	}
				1957	EXPORT_SYMBOL_GPL(kvm_map_gfn);
				1958
				1959	int kvm_vcpu_map(struct kvm_vcpu vcpu, gfn_t gfn, struct kvm_host_map map)
				1960	{
				1961	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
				1962	NULL, false);
				1963	}
				1964	EXPORT_SYMBOL_GPL(kvm_vcpu_map);
				1965
				1966	static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
				1967	struct kvm_host_map *map,
				1968	struct gfn_to_pfn_cache *cache,
				1969	bool dirty, bool atomic)
				1970	{
				1971	if (!map)
				1972	return;
				1973
				1974	if (!map->hva)
				1975	return;
				1976
				1977	if (map->page != KVM_UNMAPPED_PAGE) {
				1978	if (atomic)
				1979	kunmap_atomic(map->hva);
				1980	else
				1981	kunmap(map->page);
				1982	}
				1983	#ifdef CONFIG_HAS_IOMEM
				1984	else if (!atomic)
				1985	memunmap(map->hva);
				1986	else
				1987	WARN_ONCE(1, "Unexpected unmapping in atomic context");
				1988	#endif
				1989
				1990	if (dirty)
				1991	mark_page_dirty_in_slot(memslot, map->gfn);
				1992
				1993	if (cache)
				1994	cache->dirty \|= dirty;
				1995	else
				1996	kvm_release_pfn(map->pfn, dirty, NULL);
				1997
				1998	map->hva = NULL;
				1999	map->page = NULL;
				2000	}
				2001
				2002	int kvm_unmap_gfn(struct kvm_vcpu vcpu, struct kvm_host_map map,
				2003	struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
				2004	{
				2005	__kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
				2006	cache, dirty, atomic);
				2007	return 0;
				2008	}
				2009	EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
				2010
				2011	void kvm_vcpu_unmap(struct kvm_vcpu vcpu, struct kvm_host_map map, bool dirty)
				2012	{
				2013	__kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
				2014	dirty, false);
				2015	}
				2016	EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
				2017
				2018	struct page kvm_vcpu_gfn_to_page(struct kvm_vcpu vcpu, gfn_t gfn)
				2019	{
				2020	kvm_pfn_t pfn;
				2021
				2022	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
				2023
				2024	return kvm_pfn_to_page(pfn);
				2025	}
				2026	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
				2027
				2028	void kvm_release_page_clean(struct page *page)
				2029	{
				2030	WARN_ON(is_error_page(page));
				2031
				2032	kvm_release_pfn_clean(page_to_pfn(page));
				2033	}
				2034	EXPORT_SYMBOL_GPL(kvm_release_page_clean);
				2035
				2036	void kvm_release_pfn_clean(kvm_pfn_t pfn)
				2037	{
				2038	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
				2039	put_page(pfn_to_page(pfn));
				2040	}
				2041	EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
				2042
				2043	void kvm_release_page_dirty(struct page *page)
				2044	{
				2045	WARN_ON(is_error_page(page));
				2046
				2047	kvm_release_pfn_dirty(page_to_pfn(page));
				2048	}
				2049	EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
				2050
				2051	void kvm_release_pfn_dirty(kvm_pfn_t pfn)
				2052	{
				2053	kvm_set_pfn_dirty(pfn);
				2054	kvm_release_pfn_clean(pfn);
				2055	}
				2056	EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
				2057
				2058	void kvm_set_pfn_dirty(kvm_pfn_t pfn)
				2059	{
				2060	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
				2061	struct page *page = pfn_to_page(pfn);
				2062
				2063	SetPageDirty(page);
				2064	}
				2065	}
				2066	EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
				2067
				2068	void kvm_set_pfn_accessed(kvm_pfn_t pfn)
				2069	{
				2070	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
				2071	mark_page_accessed(pfn_to_page(pfn));
				2072	}
				2073	EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
				2074
				2075	void kvm_get_pfn(kvm_pfn_t pfn)
				2076	{
				2077	if (!kvm_is_reserved_pfn(pfn))
				2078	get_page(pfn_to_page(pfn));
				2079	}
				2080	EXPORT_SYMBOL_GPL(kvm_get_pfn);
				2081
				/*
				 * Size of the next chunk of a transfer that must not cross a page boundary:
				 * the smaller of @len and the number of bytes left in the page after
				 * @offset.  Returns 0 once @len is 0.
				 */
				static int next_segment(unsigned long len, int offset)
				{
					unsigned long room = PAGE_SIZE - offset;

					return len > room ? room : len;
				}
				2089
				2090	static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
				2091	void *data, int offset, int len)
				2092	{
				2093	int r;
				2094	unsigned long addr;
				2095
				2096	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				2097	if (kvm_is_error_hva(addr))
				2098	return -EFAULT;
				2099	r = __copy_from_user(data, (void __user *)addr + offset, len);
				2100	if (r)
				2101	return -EFAULT;
				2102	return 0;
				2103	}
				2104
				2105	int kvm_read_guest_page(struct kvm kvm, gfn_t gfn, void data, int offset,
				2106	int len)
				2107	{
				2108	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				2109
				2110	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				2111	}
				2112	EXPORT_SYMBOL_GPL(kvm_read_guest_page);
				2113
				2114	int kvm_vcpu_read_guest_page(struct kvm_vcpu vcpu, gfn_t gfn, void data,
				2115	int offset, int len)
				2116	{
				2117	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2118
				2119	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				2120	}
				2121	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
				2122
				2123	int kvm_read_guest(struct kvm kvm, gpa_t gpa, void data, unsigned long len)
				2124	{
				2125	gfn_t gfn = gpa >> PAGE_SHIFT;
				2126	int seg;
				2127	int offset = offset_in_page(gpa);
				2128	int ret;
				2129
				2130	while ((seg = next_segment(len, offset)) != 0) {
				2131	ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
				2132	if (ret < 0)
				2133	return ret;
				2134	offset = 0;
				2135	len -= seg;
				2136	data += seg;
				2137	++gfn;
				2138	}
				2139	return 0;
				2140	}
				2141	EXPORT_SYMBOL_GPL(kvm_read_guest);
				2142
				2143	int kvm_vcpu_read_guest(struct kvm_vcpu vcpu, gpa_t gpa, void data, unsigned long len)
				2144	{
				2145	gfn_t gfn = gpa >> PAGE_SHIFT;
				2146	int seg;
				2147	int offset = offset_in_page(gpa);
				2148	int ret;
				2149
				2150	while ((seg = next_segment(len, offset)) != 0) {
				2151	ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
				2152	if (ret < 0)
				2153	return ret;
				2154	offset = 0;
				2155	len -= seg;
				2156	data += seg;
				2157	++gfn;
				2158	}
				2159	return 0;
				2160	}
				2161	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
				2162
				2163	static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				2164	void *data, int offset, unsigned long len)
				2165	{
				2166	int r;
				2167	unsigned long addr;
				2168
				2169	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				2170	if (kvm_is_error_hva(addr))
				2171	return -EFAULT;
				2172	pagefault_disable();
				2173	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
				2174	pagefault_enable();
				2175	if (r)
				2176	return -EFAULT;
				2177	return 0;
				2178	}
				2179
				2180	int kvm_read_guest_atomic(struct kvm kvm, gpa_t gpa, void data,
				2181	unsigned long len)
				2182	{
				2183	gfn_t gfn = gpa >> PAGE_SHIFT;
				2184	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				2185	int offset = offset_in_page(gpa);
				2186
				2187	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				2188	}
				2189	EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
				2190
				2191	int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
				2192	void *data, unsigned long len)
				2193	{
				2194	gfn_t gfn = gpa >> PAGE_SHIFT;
				2195	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2196	int offset = offset_in_page(gpa);
				2197
				2198	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				2199	}
				2200	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
				2201
				2202	static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
				2203	const void *data, int offset, int len)
				2204	{
				2205	int r;
				2206	unsigned long addr;
				2207
				2208	addr = gfn_to_hva_memslot(memslot, gfn);
				2209	if (kvm_is_error_hva(addr))
				2210	return -EFAULT;
				2211	r = __copy_to_user((void __user *)addr + offset, data, len);
				2212	if (r)
				2213	return -EFAULT;
				2214	mark_page_dirty_in_slot(memslot, gfn);
				2215	return 0;
				2216	}
				2217
				2218	int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
				2219	const void *data, int offset, int len)
				2220	{
				2221	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				2222
				2223	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				2224	}
				2225	EXPORT_SYMBOL_GPL(kvm_write_guest_page);
				2226
				2227	int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				2228	const void *data, int offset, int len)
				2229	{
				2230	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2231
				2232	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				2233	}
				2234	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
				2235
				2236	int kvm_write_guest(struct kvm kvm, gpa_t gpa, const void data,
				2237	unsigned long len)
				2238	{
				2239	gfn_t gfn = gpa >> PAGE_SHIFT;
				2240	int seg;
				2241	int offset = offset_in_page(gpa);
				2242	int ret;
				2243
				2244	while ((seg = next_segment(len, offset)) != 0) {
				2245	ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
				2246	if (ret < 0)
				2247	return ret;
				2248	offset = 0;
				2249	len -= seg;
				2250	data += seg;
				2251	++gfn;
				2252	}
				2253	return 0;
				2254	}
				2255	EXPORT_SYMBOL_GPL(kvm_write_guest);
				2256
				2257	int kvm_vcpu_write_guest(struct kvm_vcpu vcpu, gpa_t gpa, const void data,
				2258	unsigned long len)
				2259	{
				2260	gfn_t gfn = gpa >> PAGE_SHIFT;
				2261	int seg;
				2262	int offset = offset_in_page(gpa);
				2263	int ret;
				2264
				2265	while ((seg = next_segment(len, offset)) != 0) {
				2266	ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
				2267	if (ret < 0)
				2268	return ret;
				2269	offset = 0;
				2270	len -= seg;
				2271	data += seg;
				2272	++gfn;
				2273	}
				2274	return 0;
				2275	}
				2276	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
				2277
				2278	static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				2279	struct gfn_to_hva_cache *ghc,
				2280	gpa_t gpa, unsigned long len)
				2281	{
				2282	int offset = offset_in_page(gpa);
				2283	gfn_t start_gfn = gpa >> PAGE_SHIFT;
				2284	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
				2285	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
				2286	gfn_t nr_pages_avail;
				2287	int r = start_gfn <= end_gfn ? 0 : -EINVAL;
				2288
				2289	ghc->gpa = gpa;
				2290	ghc->generation = slots->generation;
				2291	ghc->len = len;
				2292	ghc->hva = KVM_HVA_ERR_BAD;
				2293
				2294	/*
				2295	* If the requested region crosses two memslots, we still
				2296	* verify that the entire region is valid here.
				2297	*/
				2298	while (!r && start_gfn <= end_gfn) {
				2299	ghc->memslot = __gfn_to_memslot(slots, start_gfn);
				2300	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
				2301	&nr_pages_avail);
				2302	if (kvm_is_error_hva(ghc->hva))
				2303	r = -EFAULT;
				2304	start_gfn += nr_pages_avail;
				2305	}
				2306
				2307	/* Use the slow path for cross page reads and writes. */
				2308	if (!r && nr_pages_needed == 1)
				2309	ghc->hva += offset;
				2310	else
				2311	ghc->memslot = NULL;
				2312
				2313	return r;
				2314	}
				2315
				2316	int kvm_gfn_to_hva_cache_init(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2317	gpa_t gpa, unsigned long len)
				2318	{
				2319	struct kvm_memslots *slots = kvm_memslots(kvm);
				2320	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
				2321	}
				2322	EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
				2323
				2324	int kvm_write_guest_offset_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2325	void *data, unsigned int offset,
				2326	unsigned long len)
				2327	{
				2328	struct kvm_memslots *slots = kvm_memslots(kvm);
				2329	int r;
				2330	gpa_t gpa = ghc->gpa + offset;
				2331
				2332	BUG_ON(len + offset > ghc->len);
				2333
				2334	if (slots->generation != ghc->generation)
				2335	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2336
				2337	if (kvm_is_error_hva(ghc->hva))
				2338	return -EFAULT;
				2339
				2340	if (unlikely(!ghc->memslot))
				2341	return kvm_write_guest(kvm, gpa, data, len);
				2342
				2343	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
				2344	if (r)
				2345	return -EFAULT;
				2346	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
				2347
				2348	return 0;
				2349	}
				2350	EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
				2351
				/*
				 * Write @len bytes from @data to the start of the guest region described
				 * by @ghc; equivalent to kvm_write_guest_offset_cached() with offset 0.
				 */
				int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
							   void *data, unsigned long len)
				{
					return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
				}
				EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
				2358
				2359	int kvm_read_guest_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2360	void *data, unsigned long len)
				2361	{
				2362	struct kvm_memslots *slots = kvm_memslots(kvm);
				2363	int r;
				2364
				2365	BUG_ON(len > ghc->len);
				2366
				2367	if (slots->generation != ghc->generation)
				2368	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2369
				2370	if (kvm_is_error_hva(ghc->hva))
				2371	return -EFAULT;
				2372
				2373	if (unlikely(!ghc->memslot))
				2374	return kvm_read_guest(kvm, ghc->gpa, data, len);
				2375
				2376	r = __copy_from_user(data, (void __user *)ghc->hva, len);
				2377	if (r)
				2378	return -EFAULT;
				2379
				2380	return 0;
				2381	}
				2382	EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
				2383
				2384	int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
				2385	{
				2386	const void zero_page = (const void ) __va(page_to_phys(ZERO_PAGE(0)));
				2387
				2388	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
				2389	}
				2390	EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
				2391
				2392	int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
				2393	{
				2394	gfn_t gfn = gpa >> PAGE_SHIFT;
				2395	int seg;
				2396	int offset = offset_in_page(gpa);
				2397	int ret;
				2398
				2399	while ((seg = next_segment(len, offset)) != 0) {
				2400	ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
				2401	if (ret < 0)
				2402	return ret;
				2403	offset = 0;
				2404	len -= seg;
				2405	++gfn;
				2406	}
				2407	return 0;
				2408	}
				2409	EXPORT_SYMBOL_GPL(kvm_clear_guest);
				2410
				2411	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
				2412	gfn_t gfn)
				2413	{
				2414	if (memslot && memslot->dirty_bitmap) {
				2415	unsigned long rel_gfn = gfn - memslot->base_gfn;
				2416
				2417	set_bit_le(rel_gfn, memslot->dirty_bitmap);
				2418	}
				2419	}
				2420
				2421	void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
				2422	{
				2423	struct kvm_memory_slot *memslot;
				2424
				2425	memslot = gfn_to_memslot(kvm, gfn);
				2426	mark_page_dirty_in_slot(memslot, gfn);
				2427	}
				2428	EXPORT_SYMBOL_GPL(mark_page_dirty);
				2429
				2430	void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
				2431	{
				2432	struct kvm_memory_slot *memslot;
				2433
				2434	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2435	mark_page_dirty_in_slot(memslot, gfn);
				2436	}
				2437	EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
				2438
				2439	void kvm_sigset_activate(struct kvm_vcpu *vcpu)
				2440	{
				2441	if (!vcpu->sigset_active)
				2442	return;
				2443
				2444	/*
				2445	* This does a lockless modification of ->real_blocked, which is fine
				2446	* because, only current can change ->real_blocked and all readers of
				2447	* ->real_blocked don't care as long ->real_blocked is always a subset
				2448	* of ->blocked.
				2449	*/
				2450	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
				2451	}
				2452
				2453	void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
				2454	{
				2455	if (!vcpu->sigset_active)
				2456	return;
				2457
				2458	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
				2459	sigemptyset(&current->real_blocked);
				2460	}
				2461
				2462	static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
				2463	{
				2464	unsigned int old, val, grow, grow_start;
				2465
				2466	old = val = vcpu->halt_poll_ns;
				2467	grow_start = READ_ONCE(halt_poll_ns_grow_start);
				2468	grow = READ_ONCE(halt_poll_ns_grow);
				2469	if (!grow)
				2470	goto out;
				2471
				2472	val *= grow;
				2473	if (val < grow_start)
				2474	val = grow_start;
				2475
				2476	if (val > halt_poll_ns)
				2477	val = halt_poll_ns;
				2478
				2479	vcpu->halt_poll_ns = val;
				2480	out:
				2481	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
				2482	}
				2483
				2484	static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
				2485	{
				2486	unsigned int old, val, shrink, grow_start;
				2487
				2488	old = val = vcpu->halt_poll_ns;
				2489	shrink = READ_ONCE(halt_poll_ns_shrink);
				2490	grow_start = READ_ONCE(halt_poll_ns_grow_start);
				2491	if (shrink == 0)
				2492	val = 0;
				2493	else
				2494	val /= shrink;
				2495
				2496	if (val < grow_start)
				2497	val = 0;
				2498
				2499	vcpu->halt_poll_ns = val;
				2500	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
				2501	}
				2502
				2503	static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
				2504	{
				2505	int ret = -EINTR;
				2506	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				2507
				2508	if (kvm_arch_vcpu_runnable(vcpu)) {
				2509	kvm_make_request(KVM_REQ_UNHALT, vcpu);
				2510	goto out;
				2511	}
				2512	if (kvm_cpu_has_pending_timer(vcpu))
				2513	goto out;
				2514	if (signal_pending(current))
				2515	goto out;
				2516
				2517	ret = 0;
				2518	out:
				2519	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				2520	return ret;
				2521	}
				2522
				2523	/*
				2524	* The vCPU has executed a HLT instruction with in-kernel mode enabled.
				2525	*/
				2526	void kvm_vcpu_block(struct kvm_vcpu *vcpu)
				2527	{
				2528	ktime_t start, cur;
				2529	DECLARE_SWAITQUEUE(wait);
				2530	bool waited = false;
				2531	u64 block_ns;
				2532
				2533	kvm_arch_vcpu_blocking(vcpu);
				2534
				2535	start = cur = ktime_get();
				2536	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
				2537	ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
				2538
				2539	++vcpu->stat.halt_attempted_poll;
				2540	do {
				2541	/*
				2542	* This sets KVM_REQ_UNHALT if an interrupt
				2543	* arrives.
				2544	*/
				2545	if (kvm_vcpu_check_block(vcpu) < 0) {
				2546	++vcpu->stat.halt_successful_poll;
				2547	if (!vcpu_valid_wakeup(vcpu))
				2548	++vcpu->stat.halt_poll_invalid;
				2549	goto out;
				2550	}
				2551	cur = ktime_get();
				2552	} while (single_task_running() && ktime_before(cur, stop));
				2553	}
				2554
				2555	for (;;) {
				2556	prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
				2557
				2558	if (kvm_vcpu_check_block(vcpu) < 0)
				2559	break;
				2560
				2561	waited = true;
				2562	schedule();
				2563	}
				2564
				2565	finish_swait(&vcpu->wq, &wait);
				2566	cur = ktime_get();
				2567	out:
				2568	kvm_arch_vcpu_unblocking(vcpu);
				2569	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
				2570
				2571	if (!kvm_arch_no_poll(vcpu)) {
				2572	if (!vcpu_valid_wakeup(vcpu)) {
				2573	shrink_halt_poll_ns(vcpu);
				2574	} else if (halt_poll_ns) {
				2575	if (block_ns <= vcpu->halt_poll_ns)
				2576	;
				2577	/* we had a long block, shrink polling */
				2578	else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
				2579	shrink_halt_poll_ns(vcpu);
				2580	/* we had a short halt and our poll time is too small */
				2581	else if (vcpu->halt_poll_ns < halt_poll_ns &&
				2582	block_ns < halt_poll_ns)
				2583	grow_halt_poll_ns(vcpu);
				2584	} else {
				2585	vcpu->halt_poll_ns = 0;
				2586	}
				2587	}
				2588
				2589	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
				2590	kvm_arch_vcpu_block_finish(vcpu);
				2591	}
				2592	EXPORT_SYMBOL_GPL(kvm_vcpu_block);
				2593
				2594	bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
				2595	{
				2596	struct swait_queue_head *wqp;
				2597
				2598	wqp = kvm_arch_vcpu_wq(vcpu);
				2599	if (swq_has_sleeper(wqp)) {
				2600	swake_up_one(wqp);
				2601	WRITE_ONCE(vcpu->ready, true);
				2602	++vcpu->stat.halt_wakeup;
				2603	return true;
				2604	}
				2605
				2606	return false;
				2607	}
				2608	EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
				2609
				2610	#ifndef CONFIG_S390
				2611	/*
				2612	* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
				2613	*/
				2614	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
				2615	{
				2616	int me;
				2617	int cpu = vcpu->cpu;
				2618
				2619	if (kvm_vcpu_wake_up(vcpu))
				2620	return;
				2621
				2622	me = get_cpu();
				2623	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
				2624	if (kvm_arch_vcpu_should_kick(vcpu))
				2625	smp_send_reschedule(cpu);
				2626	put_cpu();
				2627	}
				2628	EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
				2629	#endif /* !CONFIG_S390 */
				2630
				2631	int kvm_vcpu_yield_to(struct kvm_vcpu *target)
				2632	{
				2633	struct pid *pid;
				2634	struct task_struct *task = NULL;
				2635	int ret = 0;
				2636
				2637	rcu_read_lock();
				2638	pid = rcu_dereference(target->pid);
				2639	if (pid)
				2640	task = get_pid_task(pid, PIDTYPE_PID);
				2641	rcu_read_unlock();
				2642	if (!task)
				2643	return ret;
				2644	ret = yield_to(task, 1);
				2645	put_task_struct(task);
				2646
				2647	return ret;
				2648	}
				2649	EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
				2650
				2651	/*
				2652	* Helper that checks whether a VCPU is eligible for directed yield.
				2653	* Most eligible candidate to yield is decided by following heuristics:
				2654	*
				2655	* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
				2656	* (preempted lock holder), indicated by @in_spin_loop.
				2657	* Set at the beiginning and cleared at the end of interception/PLE handler.
				2658	*
				2659	* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
				2660	* chance last time (mostly it has become eligible now since we have probably
				2661	* yielded to lockholder in last iteration. This is done by toggling
				2662	* @dy_eligible each time a VCPU checked for eligibility.)
				2663	*
				2664	* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
				2665	* to preempted lock-holder could result in wrong VCPU selection and CPU
				2666	* burning. Giving priority for a potential lock-holder increases lock
				2667	* progress.
				2668	*
				2669	* Since algorithm is based on heuristics, accessing another VCPU data without
				2670	* locking does not harm. It may result in trying to yield to same VCPU, fail
				2671	* and continue with next VCPU and so on.
				2672	*/
				2673	static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
				2674	{
				2675	#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
				2676	bool eligible;
				2677
				2678	eligible = !vcpu->spin_loop.in_spin_loop \|\|
				2679	vcpu->spin_loop.dy_eligible;
				2680
				2681	if (vcpu->spin_loop.in_spin_loop)
				2682	kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
				2683
				2684	return eligible;
				2685	#else
				2686	return true;
				2687	#endif
				2688	}
				2689
				2690	/*
				2691	* Unlike kvm_arch_vcpu_runnable, this function is called outside
				2692	* a vcpu_load/vcpu_put pair. However, for most architectures
				2693	* kvm_arch_vcpu_runnable does not require vcpu_load.
				2694	*/
				2695	bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
				2696	{
				2697	return kvm_arch_vcpu_runnable(vcpu);
				2698	}
				2699
				2700	static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
				2701	{
				2702	if (kvm_arch_dy_runnable(vcpu))
				2703	return true;
				2704
				2705	#ifdef CONFIG_KVM_ASYNC_PF
				2706	if (!list_empty_careful(&vcpu->async_pf.done))
				2707	return true;
				2708	#endif
				2709
				2710	return false;
				2711	}
				2712
				2713	void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
				2714	{
				2715	struct kvm *kvm = me->kvm;
				2716	struct kvm_vcpu *vcpu;
				2717	int last_boosted_vcpu;
				2718	int yielded = 0;
				2719	int try = 3;
				2720	int pass;
				2721	int i;
				2722
				2723	last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
				2724	kvm_vcpu_set_in_spin_loop(me, true);
				2725	/*
				2726	* We boost the priority of a VCPU that is runnable but not
				2727	* currently running, because it got preempted by something
				2728	* else and called schedule in __vcpu_run. Hopefully that
				2729	* VCPU is holding the lock that we need and will release it.
				2730	* We approximate round-robin by starting at the last boosted VCPU.
				2731	*/
				2732	for (pass = 0; pass < 2 && !yielded && try; pass++) {
				2733	kvm_for_each_vcpu(i, vcpu, kvm) {
				2734	if (!pass && i <= last_boosted_vcpu) {
				2735	i = last_boosted_vcpu;
				2736	continue;
				2737	} else if (pass && i > last_boosted_vcpu)
				2738	break;
				2739	if (!READ_ONCE(vcpu->ready))
				2740	continue;
				2741	if (vcpu == me)
				2742	continue;
				2743	if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
				2744	continue;
				2745	if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
				2746	!kvm_arch_vcpu_in_kernel(vcpu))
				2747	continue;
				2748	if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
				2749	continue;
				2750
				2751	yielded = kvm_vcpu_yield_to(vcpu);
				2752	if (yielded > 0) {
				2753	WRITE_ONCE(kvm->last_boosted_vcpu, i);
				2754	break;
				2755	} else if (yielded < 0) {
				2756	try--;
				2757	if (!try)
				2758	break;
				2759	}
				2760	}
				2761	}
				2762	kvm_vcpu_set_in_spin_loop(me, false);
				2763
				2764	/* Ensure vcpu is not eligible during next spinloop */
				2765	kvm_vcpu_set_dy_eligible(me, false);
				2766	}
				2767	EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
				2768
				2769	static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
				2770	{
				2771	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
				2772	struct page *page;
				2773
				2774	if (vmf->pgoff == 0)
				2775	page = virt_to_page(vcpu->run);
				2776	#ifdef CONFIG_X86
				2777	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
				2778	page = virt_to_page(vcpu->arch.pio_data);
				2779	#endif
				2780	#ifdef CONFIG_KVM_MMIO
				2781	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
				2782	page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
				2783	#endif
				2784	else
				2785	return kvm_arch_vcpu_fault(vcpu, vmf);
				2786	get_page(page);
				2787	vmf->page = page;
				2788	return 0;
				2789	}
				2790
				2791	static const struct vm_operations_struct kvm_vcpu_vm_ops = {
				2792	.fault = kvm_vcpu_fault,
				2793	};
				2794
				2795	static int kvm_vcpu_mmap(struct file file, struct vm_area_struct vma)
				2796	{
				2797	vma->vm_ops = &kvm_vcpu_vm_ops;
				2798	return 0;
				2799	}
				2800
				2801	static int kvm_vcpu_release(struct inode inode, struct file filp)
				2802	{
				2803	struct kvm_vcpu *vcpu = filp->private_data;
				2804
				2805	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2806	kvm_put_kvm(vcpu->kvm);
				2807	return 0;
				2808	}
				2809
				2810	static struct file_operations kvm_vcpu_fops = {
				2811	.release = kvm_vcpu_release,
				2812	.unlocked_ioctl = kvm_vcpu_ioctl,
				2813	.mmap = kvm_vcpu_mmap,
				2814	.llseek = noop_llseek,
				2815	KVM_COMPAT(kvm_vcpu_compat_ioctl),
				2816	};
				2817
				2818	/*
				2819	* Allocates an inode for the vcpu.
				2820	*/
				2821	static int create_vcpu_fd(struct kvm_vcpu *vcpu)
				2822	{
				2823	char name[8 + 1 + ITOA_MAX_LEN + 1];
				2824
				2825	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
				2826	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR \| O_CLOEXEC);
				2827	}
				2828
				/*
				 * Create the per-vCPU debugfs directory "vcpu<id>" under the VM's debugfs
				 * directory and let the architecture populate it.  Compiles to an empty
				 * function unless __KVM_HAVE_ARCH_VCPU_DEBUGFS is defined; does nothing at
				 * run time when debugfs is not initialised.
				 */
				static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
				{
				#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
					char dir_name[ITOA_MAX_LEN * 2];
					struct dentry *parent;

					if (!debugfs_initialized())
						return;

					snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
					parent = vcpu->kvm->debugfs_dentry;
					vcpu->debugfs_dentry = debugfs_create_dir(dir_name, parent);

					kvm_arch_create_vcpu_debugfs(vcpu);
				#endif
				}
				2844
				2845	/*
				2846	* Creates some virtual cpus. Good luck creating more than one.
				2847	*/
				2848	static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
				2849	{
				2850	int r;
				2851	struct kvm_vcpu *vcpu;
				2852
				2853	if (id >= KVM_MAX_VCPU_ID)
				2854	return -EINVAL;
				2855
				2856	mutex_lock(&kvm->lock);
				2857	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
				2858	mutex_unlock(&kvm->lock);
				2859	return -EINVAL;
				2860	}
				2861
				2862	kvm->created_vcpus++;
				2863	mutex_unlock(&kvm->lock);
				2864
				2865	vcpu = kvm_arch_vcpu_create(kvm, id);
				2866	if (IS_ERR(vcpu)) {
				2867	r = PTR_ERR(vcpu);
				2868	goto vcpu_decrement;
				2869	}
				2870
				2871	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
				2872
				2873	r = kvm_arch_vcpu_setup(vcpu);
				2874	if (r)
				2875	goto vcpu_destroy;
				2876
				2877	kvm_create_vcpu_debugfs(vcpu);
				2878
				2879	mutex_lock(&kvm->lock);
				2880	if (kvm_get_vcpu_by_id(kvm, id)) {
				2881	r = -EEXIST;
				2882	goto unlock_vcpu_destroy;
				2883	}
				2884
				2885	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
				2886	BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
				2887
				2888	/* Now it's all set up, let userspace reach it */
				2889	kvm_get_kvm(kvm);
				2890	r = create_vcpu_fd(vcpu);
				2891	if (r < 0) {
				2892	kvm_put_kvm(kvm);
				2893	goto unlock_vcpu_destroy;
				2894	}
				2895
				2896	kvm->vcpus[vcpu->vcpu_idx] = vcpu;
				2897
				2898	/*
				2899	* Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
				2900	* before kvm->online_vcpu's incremented value.
				2901	*/
				2902	smp_wmb();
				2903	atomic_inc(&kvm->online_vcpus);
				2904
				2905	mutex_unlock(&kvm->lock);
				2906	kvm_arch_vcpu_postcreate(vcpu);
				2907	return r;
				2908
				2909	unlock_vcpu_destroy:
				2910	mutex_unlock(&kvm->lock);
				2911	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2912	vcpu_destroy:
				2913	kvm_arch_vcpu_destroy(vcpu);
				2914	vcpu_decrement:
				2915	mutex_lock(&kvm->lock);
				2916	kvm->created_vcpus--;
				2917	mutex_unlock(&kvm->lock);
				2918	return r;
				2919	}
				2920
				2921	static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu vcpu, sigset_t sigset)
				2922	{
				2923	if (sigset) {
				2924	sigdelsetmask(sigset, sigmask(SIGKILL)\|sigmask(SIGSTOP));
				2925	vcpu->sigset_active = 1;
				2926	vcpu->sigset = *sigset;
				2927	} else
				2928	vcpu->sigset_active = 0;
				2929	return 0;
				2930	}
				2931
				2932	static long kvm_vcpu_ioctl(struct file *filp,
				2933	unsigned int ioctl, unsigned long arg)
				2934	{
				2935	struct kvm_vcpu *vcpu = filp->private_data;
				2936	void __user argp = (void __user )arg;
				2937	int r;
				2938	struct kvm_fpu *fpu = NULL;
				2939	struct kvm_sregs *kvm_sregs = NULL;
				2940
				2941	if (vcpu->kvm->mm != current->mm \|\| vcpu->kvm->vm_bugged)
				2942	return -EIO;
				2943
				2944	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
				2945	return -EINVAL;
				2946
				2947	/*
				2948	* Some architectures have vcpu ioctls that are asynchronous to vcpu
				2949	* execution; mutex_lock() would break them.
				2950	*/
				2951	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
				2952	if (r != -ENOIOCTLCMD)
				2953	return r;
				2954
				2955	if (mutex_lock_killable(&vcpu->mutex))
				2956	return -EINTR;
				2957	switch (ioctl) {
				2958	case KVM_RUN: {
				2959	struct pid *oldpid;
				2960	r = -EINVAL;
				2961	if (arg)
				2962	goto out;
				2963	oldpid = rcu_access_pointer(vcpu->pid);
				2964	if (unlikely(oldpid != task_pid(current))) {
				2965	/* The thread running this VCPU changed. */
				2966	struct pid *newpid;
				2967
				2968	r = kvm_arch_vcpu_run_pid_change(vcpu);
				2969	if (r)
				2970	break;
				2971
				2972	newpid = get_task_pid(current, PIDTYPE_PID);
				2973	rcu_assign_pointer(vcpu->pid, newpid);
				2974	if (oldpid)
				2975	synchronize_rcu();
				2976	put_pid(oldpid);
				2977	}
				2978	r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
				2979	trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
				2980	break;
				2981	}
				2982	case KVM_GET_REGS: {
				2983	struct kvm_regs *kvm_regs;
				2984
				2985	r = -ENOMEM;
				2986	kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
				2987	if (!kvm_regs)
				2988	goto out;
				2989	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
				2990	if (r)
				2991	goto out_free1;
				2992	r = -EFAULT;
				2993	if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
				2994	goto out_free1;
				2995	r = 0;
				2996	out_free1:
				2997	kfree(kvm_regs);
				2998	break;
				2999	}
				3000	case KVM_SET_REGS: {
				3001	struct kvm_regs *kvm_regs;
				3002
				3003	r = -ENOMEM;
				3004	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
				3005	if (IS_ERR(kvm_regs)) {
				3006	r = PTR_ERR(kvm_regs);
				3007	goto out;
				3008	}
				3009	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
				3010	kfree(kvm_regs);
				3011	break;
				3012	}
				3013	case KVM_GET_SREGS: {
				3014	kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
				3015	GFP_KERNEL_ACCOUNT);
				3016	r = -ENOMEM;
				3017	if (!kvm_sregs)
				3018	goto out;
				3019	r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
				3020	if (r)
				3021	goto out;
				3022	r = -EFAULT;
				3023	if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
				3024	goto out;
				3025	r = 0;
				3026	break;
				3027	}
				3028	case KVM_SET_SREGS: {
				3029	kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
				3030	if (IS_ERR(kvm_sregs)) {
				3031	r = PTR_ERR(kvm_sregs);
				3032	kvm_sregs = NULL;
				3033	goto out;
				3034	}
				3035	r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
				3036	break;
				3037	}
				3038	case KVM_GET_MP_STATE: {
				3039	struct kvm_mp_state mp_state;
				3040
				3041	r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
				3042	if (r)
				3043	goto out;
				3044	r = -EFAULT;
				3045	if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
				3046	goto out;
				3047	r = 0;
				3048	break;
				3049	}
				3050	case KVM_SET_MP_STATE: {
				3051	struct kvm_mp_state mp_state;
				3052
				3053	r = -EFAULT;
				3054	if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
				3055	goto out;
				3056	r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
				3057	break;
				3058	}
				3059	case KVM_TRANSLATE: {
				3060	struct kvm_translation tr;
				3061
				3062	r = -EFAULT;
				3063	if (copy_from_user(&tr, argp, sizeof(tr)))
				3064	goto out;
				3065	r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
				3066	if (r)
				3067	goto out;
				3068	r = -EFAULT;
				3069	if (copy_to_user(argp, &tr, sizeof(tr)))
				3070	goto out;
				3071	r = 0;
				3072	break;
				3073	}
				3074	case KVM_SET_GUEST_DEBUG: {
				3075	struct kvm_guest_debug dbg;
				3076
				3077	r = -EFAULT;
				3078	if (copy_from_user(&dbg, argp, sizeof(dbg)))
				3079	goto out;
				3080	r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
				3081	break;
				3082	}
				3083	case KVM_SET_SIGNAL_MASK: {
				3084	struct kvm_signal_mask __user *sigmask_arg = argp;
				3085	struct kvm_signal_mask kvm_sigmask;
				3086	sigset_t sigset, *p;
				3087
				3088	p = NULL;
				3089	if (argp) {
				3090	r = -EFAULT;
				3091	if (copy_from_user(&kvm_sigmask, argp,
				3092	sizeof(kvm_sigmask)))
				3093	goto out;
				3094	r = -EINVAL;
				3095	if (kvm_sigmask.len != sizeof(sigset))
				3096	goto out;
				3097	r = -EFAULT;
				3098	if (copy_from_user(&sigset, sigmask_arg->sigset,
				3099	sizeof(sigset)))
				3100	goto out;
				3101	p = &sigset;
				3102	}
				3103	r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
				3104	break;
				3105	}
				3106	case KVM_GET_FPU: {
				3107	fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
				3108	r = -ENOMEM;
				3109	if (!fpu)
				3110	goto out;
				3111	r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
				3112	if (r)
				3113	goto out;
				3114	r = -EFAULT;
				3115	if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
				3116	goto out;
				3117	r = 0;
				3118	break;
				3119	}
				3120	case KVM_SET_FPU: {
				3121	fpu = memdup_user(argp, sizeof(*fpu));
				3122	if (IS_ERR(fpu)) {
				3123	r = PTR_ERR(fpu);
				3124	fpu = NULL;
				3125	goto out;
				3126	}
				3127	r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
				3128	break;
				3129	}
				3130	default:
				3131	r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
				3132	}
				3133	out:
				3134	mutex_unlock(&vcpu->mutex);
				3135	kfree(fpu);
				3136	kfree(kvm_sregs);
				3137	return r;
				3138	}
				3139
				3140	#ifdef CONFIG_KVM_COMPAT
				3141	static long kvm_vcpu_compat_ioctl(struct file *filp,
				3142	unsigned int ioctl, unsigned long arg)
				3143	{
				3144	struct kvm_vcpu *vcpu = filp->private_data;
				3145	void __user *argp = compat_ptr(arg);
				3146	int r;
				3147
				3148	if (vcpu->kvm->mm != current->mm \|\| vcpu->kvm->vm_bugged)
				3149	return -EIO;
				3150
				3151	switch (ioctl) {
				3152	case KVM_SET_SIGNAL_MASK: {
				3153	struct kvm_signal_mask __user *sigmask_arg = argp;
				3154	struct kvm_signal_mask kvm_sigmask;
				3155	sigset_t sigset;
				3156
				3157	if (argp) {
				3158	r = -EFAULT;
				3159	if (copy_from_user(&kvm_sigmask, argp,
				3160	sizeof(kvm_sigmask)))
				3161	goto out;
				3162	r = -EINVAL;
				3163	if (kvm_sigmask.len != sizeof(compat_sigset_t))
				3164	goto out;
				3165	r = -EFAULT;
				3166	if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
				3167	goto out;
				3168	r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
				3169	} else
				3170	r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
				3171	break;
				3172	}
				3173	default:
				3174	r = kvm_vcpu_ioctl(filp, ioctl, arg);
				3175	}
				3176
				3177	out:
				3178	return r;
				3179	}
				3180	#endif
				3181
				3182	static int kvm_device_mmap(struct file filp, struct vm_area_struct vma)
				3183	{
				3184	struct kvm_device *dev = filp->private_data;
				3185
				3186	if (dev->ops->mmap)
				3187	return dev->ops->mmap(dev, vma);
				3188
				3189	return -ENODEV;
				3190	}
				3191
				3192	static int kvm_device_ioctl_attr(struct kvm_device *dev,
				3193	int (accessor)(struct kvm_device dev,
				3194	struct kvm_device_attr *attr),
				3195	unsigned long arg)
				3196	{
				3197	struct kvm_device_attr attr;
				3198
				3199	if (!accessor)
				3200	return -EPERM;
				3201
				3202	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
				3203	return -EFAULT;
				3204
				3205	return accessor(dev, &attr);
				3206	}
				3207
				3208	static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
				3209	unsigned long arg)
				3210	{
				3211	struct kvm_device *dev = filp->private_data;
				3212
				3213	if (dev->kvm->mm != current->mm \|\| dev->kvm->vm_bugged)
				3214	return -EIO;
				3215
				3216	switch (ioctl) {
				3217	case KVM_SET_DEVICE_ATTR:
				3218	return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
				3219	case KVM_GET_DEVICE_ATTR:
				3220	return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
				3221	case KVM_HAS_DEVICE_ATTR:
				3222	return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
				3223	default:
				3224	if (dev->ops->ioctl)
				3225	return dev->ops->ioctl(dev, ioctl, arg);
				3226
				3227	return -ENOTTY;
				3228	}
				3229	}
				3230
				3231	static int kvm_device_release(struct inode inode, struct file filp)
				3232	{
				3233	struct kvm_device *dev = filp->private_data;
				3234	struct kvm *kvm = dev->kvm;
				3235
				3236	if (dev->ops->release) {
				3237	mutex_lock(&kvm->lock);
				3238	list_del(&dev->vm_node);
				3239	dev->ops->release(dev);
				3240	mutex_unlock(&kvm->lock);
				3241	}
				3242
				3243	kvm_put_kvm(kvm);
				3244	return 0;
				3245	}
				3246
				3247	static const struct file_operations kvm_device_fops = {
				3248	.unlocked_ioctl = kvm_device_ioctl,
				3249	.release = kvm_device_release,
				3250	KVM_COMPAT(kvm_device_ioctl),
				3251	.mmap = kvm_device_mmap,
				3252	};
				3253
				3254	struct kvm_device kvm_device_from_filp(struct file filp)
				3255	{
				3256	if (filp->f_op != &kvm_device_fops)
				3257	return NULL;
				3258
				3259	return filp->private_data;
				3260	}
				3261
				3262	static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
				3263	#ifdef CONFIG_KVM_MPIC
				3264	[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
				3265	[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
				3266	#endif
				3267	};
				3268
				3269	int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
				3270	{
				3271	if (type >= ARRAY_SIZE(kvm_device_ops_table))
				3272	return -ENOSPC;
				3273
				3274	if (kvm_device_ops_table[type] != NULL)
				3275	return -EEXIST;
				3276
				3277	kvm_device_ops_table[type] = ops;
				3278	return 0;
				3279	}
				3280
				3281	void kvm_unregister_device_ops(u32 type)
				3282	{
				3283	if (kvm_device_ops_table[type] != NULL)
				3284	kvm_device_ops_table[type] = NULL;
				3285	}
				3286
				3287	static int kvm_ioctl_create_device(struct kvm *kvm,
				3288	struct kvm_create_device *cd)
				3289	{
				3290	struct kvm_device_ops *ops = NULL;
				3291	struct kvm_device *dev;
				3292	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
				3293	int type;
				3294	int ret;
				3295
				3296	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
				3297	return -ENODEV;
				3298
				3299	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
				3300	ops = kvm_device_ops_table[type];
				3301	if (ops == NULL)
				3302	return -ENODEV;
				3303
				3304	if (test)
				3305	return 0;
				3306
				3307	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
				3308	if (!dev)
				3309	return -ENOMEM;
				3310
				3311	dev->ops = ops;
				3312	dev->kvm = kvm;
				3313
				3314	mutex_lock(&kvm->lock);
				3315	ret = ops->create(dev, type);
				3316	if (ret < 0) {
				3317	mutex_unlock(&kvm->lock);
				3318	kfree(dev);
				3319	return ret;
				3320	}
				3321	list_add(&dev->vm_node, &kvm->devices);
				3322	mutex_unlock(&kvm->lock);
				3323
				3324	if (ops->init)
				3325	ops->init(dev);
				3326
				3327	kvm_get_kvm(kvm);
				3328	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR \| O_CLOEXEC);
				3329	if (ret < 0) {
				3330	kvm_put_kvm(kvm);
				3331	mutex_lock(&kvm->lock);
				3332	list_del(&dev->vm_node);
				3333	if (ops->release)
				3334	ops->release(dev);
				3335	mutex_unlock(&kvm->lock);
				3336	if (ops->destroy)
				3337	ops->destroy(dev);
				3338	return ret;
				3339	}
				3340
				3341	cd->fd = ret;
				3342	return 0;
				3343	}
				3344
				3345	static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
				3346	{
				3347	switch (arg) {
				3348	case KVM_CAP_USER_MEMORY:
				3349	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
				3350	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
				3351	case KVM_CAP_INTERNAL_ERROR_DATA:
				3352	#ifdef CONFIG_HAVE_KVM_MSI
				3353	case KVM_CAP_SIGNAL_MSI:
				3354	#endif
				3355	#ifdef CONFIG_HAVE_KVM_IRQFD
				3356	case KVM_CAP_IRQFD:
				3357	case KVM_CAP_IRQFD_RESAMPLE:
				3358	#endif
				3359	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
				3360	case KVM_CAP_CHECK_EXTENSION_VM:
				3361	case KVM_CAP_ENABLE_CAP_VM:
				3362	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				3363	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
				3364	#endif
				3365	return 1;
				3366	#ifdef CONFIG_KVM_MMIO
				3367	case KVM_CAP_COALESCED_MMIO:
				3368	return KVM_COALESCED_MMIO_PAGE_OFFSET;
				3369	case KVM_CAP_COALESCED_PIO:
				3370	return 1;
				3371	#endif
				3372	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3373	case KVM_CAP_IRQ_ROUTING:
				3374	return KVM_MAX_IRQ_ROUTES;
				3375	#endif
				3376	#if KVM_ADDRESS_SPACE_NUM > 1
				3377	case KVM_CAP_MULTI_ADDRESS_SPACE:
				3378	return KVM_ADDRESS_SPACE_NUM;
				3379	#endif
				3380	case KVM_CAP_NR_MEMSLOTS:
				3381	return KVM_USER_MEM_SLOTS;
				3382	default:
				3383	break;
				3384	}
				3385	return kvm_vm_ioctl_check_extension(kvm, arg);
				3386	}
				3387
				3388	int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
				3389	struct kvm_enable_cap *cap)
				3390	{
				3391	return -EINVAL;
				3392	}
				3393
				3394	static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
				3395	struct kvm_enable_cap *cap)
				3396	{
				3397	switch (cap->cap) {
				3398	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				3399	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
				3400	if (cap->flags \|\| (cap->args[0] & ~1))
				3401	return -EINVAL;
				3402	kvm->manual_dirty_log_protect = cap->args[0];
				3403	return 0;
				3404	#endif
				3405	default:
				3406	return kvm_vm_ioctl_enable_cap(kvm, cap);
				3407	}
				3408	}
				3409
				3410	static long kvm_vm_ioctl(struct file *filp,
				3411	unsigned int ioctl, unsigned long arg)
				3412	{
				3413	struct kvm *kvm = filp->private_data;
				3414	void __user argp = (void __user )arg;
				3415	int r;
				3416
				3417	if (kvm->mm != current->mm \|\| kvm->vm_bugged)
				3418	return -EIO;
				3419	switch (ioctl) {
				3420	case KVM_CREATE_VCPU:
				3421	r = kvm_vm_ioctl_create_vcpu(kvm, arg);
				3422	break;
				3423	case KVM_ENABLE_CAP: {
				3424	struct kvm_enable_cap cap;
				3425
				3426	r = -EFAULT;
				3427	if (copy_from_user(&cap, argp, sizeof(cap)))
				3428	goto out;
				3429	r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
				3430	break;
				3431	}
				3432	case KVM_SET_USER_MEMORY_REGION: {
				3433	struct kvm_userspace_memory_region kvm_userspace_mem;
				3434
				3435	r = -EFAULT;
				3436	if (copy_from_user(&kvm_userspace_mem, argp,
				3437	sizeof(kvm_userspace_mem)))
				3438	goto out;
				3439
				3440	r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
				3441	break;
				3442	}
				3443	case KVM_GET_DIRTY_LOG: {
				3444	struct kvm_dirty_log log;
				3445
				3446	r = -EFAULT;
				3447	if (copy_from_user(&log, argp, sizeof(log)))
				3448	goto out;
				3449	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3450	break;
				3451	}
				3452	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				3453	case KVM_CLEAR_DIRTY_LOG: {
				3454	struct kvm_clear_dirty_log log;
				3455
				3456	r = -EFAULT;
				3457	if (copy_from_user(&log, argp, sizeof(log)))
				3458	goto out;
				3459	r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
				3460	break;
				3461	}
				3462	#endif
				3463	#ifdef CONFIG_KVM_MMIO
				3464	case KVM_REGISTER_COALESCED_MMIO: {
				3465	struct kvm_coalesced_mmio_zone zone;
				3466
				3467	r = -EFAULT;
				3468	if (copy_from_user(&zone, argp, sizeof(zone)))
				3469	goto out;
				3470	r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
				3471	break;
				3472	}
				3473	case KVM_UNREGISTER_COALESCED_MMIO: {
				3474	struct kvm_coalesced_mmio_zone zone;
				3475
				3476	r = -EFAULT;
				3477	if (copy_from_user(&zone, argp, sizeof(zone)))
				3478	goto out;
				3479	r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
				3480	break;
				3481	}
				3482	#endif
				3483	case KVM_IRQFD: {
				3484	struct kvm_irqfd data;
				3485
				3486	r = -EFAULT;
				3487	if (copy_from_user(&data, argp, sizeof(data)))
				3488	goto out;
				3489	r = kvm_irqfd(kvm, &data);
				3490	break;
				3491	}
				3492	case KVM_IOEVENTFD: {
				3493	struct kvm_ioeventfd data;
				3494
				3495	r = -EFAULT;
				3496	if (copy_from_user(&data, argp, sizeof(data)))
				3497	goto out;
				3498	r = kvm_ioeventfd(kvm, &data);
				3499	break;
				3500	}
				3501	#ifdef CONFIG_HAVE_KVM_MSI
				3502	case KVM_SIGNAL_MSI: {
				3503	struct kvm_msi msi;
				3504
				3505	r = -EFAULT;
				3506	if (copy_from_user(&msi, argp, sizeof(msi)))
				3507	goto out;
				3508	r = kvm_send_userspace_msi(kvm, &msi);
				3509	break;
				3510	}
				3511	#endif
				3512	#ifdef __KVM_HAVE_IRQ_LINE
				3513	case KVM_IRQ_LINE_STATUS:
				3514	case KVM_IRQ_LINE: {
				3515	struct kvm_irq_level irq_event;
				3516
				3517	r = -EFAULT;
				3518	if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
				3519	goto out;
				3520
				3521	r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
				3522	ioctl == KVM_IRQ_LINE_STATUS);
				3523	if (r)
				3524	goto out;
				3525
				3526	r = -EFAULT;
				3527	if (ioctl == KVM_IRQ_LINE_STATUS) {
				3528	if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
				3529	goto out;
				3530	}
				3531
				3532	r = 0;
				3533	break;
				3534	}
				3535	#endif
				3536	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3537	case KVM_SET_GSI_ROUTING: {
				3538	struct kvm_irq_routing routing;
				3539	struct kvm_irq_routing __user *urouting;
				3540	struct kvm_irq_routing_entry *entries = NULL;
				3541
				3542	r = -EFAULT;
				3543	if (copy_from_user(&routing, argp, sizeof(routing)))
				3544	goto out;
				3545	r = -EINVAL;
				3546	if (!kvm_arch_can_set_irq_routing(kvm))
				3547	goto out;
				3548	if (routing.nr > KVM_MAX_IRQ_ROUTES)
				3549	goto out;
				3550	if (routing.flags)
				3551	goto out;
				3552	if (routing.nr) {
				3553	r = -ENOMEM;
				3554	entries = vmalloc(array_size(sizeof(*entries),
				3555	routing.nr));
				3556	if (!entries)
				3557	goto out;
				3558	r = -EFAULT;
				3559	urouting = argp;
				3560	if (copy_from_user(entries, urouting->entries,
				3561	routing.nr * sizeof(*entries)))
				3562	goto out_free_irq_routing;
				3563	}
				3564	r = kvm_set_irq_routing(kvm, entries, routing.nr,
				3565	routing.flags);
				3566	out_free_irq_routing:
				3567	vfree(entries);
				3568	break;
				3569	}
				3570	#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
				3571	case KVM_CREATE_DEVICE: {
				3572	struct kvm_create_device cd;
				3573
				3574	r = -EFAULT;
				3575	if (copy_from_user(&cd, argp, sizeof(cd)))
				3576	goto out;
				3577
				3578	r = kvm_ioctl_create_device(kvm, &cd);
				3579	if (r)
				3580	goto out;
				3581
				3582	r = -EFAULT;
				3583	if (copy_to_user(argp, &cd, sizeof(cd)))
				3584	goto out;
				3585
				3586	r = 0;
				3587	break;
				3588	}
				3589	case KVM_CHECK_EXTENSION:
				3590	r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
				3591	break;
				3592	default:
				3593	r = kvm_arch_vm_ioctl(filp, ioctl, arg);
				3594	}
				3595	out:
				3596	return r;
				3597	}
				3598
				3599	#ifdef CONFIG_KVM_COMPAT
				3600	struct compat_kvm_dirty_log {
				3601	__u32 slot;
				3602	__u32 padding1;
				3603	union {
				3604	compat_uptr_t dirty_bitmap; /* one bit per page */
				3605	__u64 padding2;
				3606	};
				3607	};
				3608
				3609	struct compat_kvm_clear_dirty_log {
				3610	__u32 slot;
				3611	__u32 num_pages;
				3612	__u64 first_page;
				3613	union {
				3614	compat_uptr_t dirty_bitmap; /* one bit per page */
				3615	__u64 padding2;
				3616	};
				3617	};
				3618
				3619	static long kvm_vm_compat_ioctl(struct file *filp,
				3620	unsigned int ioctl, unsigned long arg)
				3621	{
				3622	struct kvm *kvm = filp->private_data;
				3623	int r;
				3624
				3625	if (kvm->mm != current->mm \|\| kvm->vm_bugged)
				3626	return -EIO;
				3627	switch (ioctl) {
				3628	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				3629	case KVM_CLEAR_DIRTY_LOG: {
				3630	struct compat_kvm_clear_dirty_log compat_log;
				3631	struct kvm_clear_dirty_log log;
				3632
				3633	if (copy_from_user(&compat_log, (void __user *)arg,
				3634	sizeof(compat_log)))
				3635	return -EFAULT;
				3636	log.slot = compat_log.slot;
				3637	log.num_pages = compat_log.num_pages;
				3638	log.first_page = compat_log.first_page;
				3639	log.padding2 = compat_log.padding2;
				3640	log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
				3641
				3642	r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
				3643	break;
				3644	}
				3645	#endif
				3646	case KVM_GET_DIRTY_LOG: {
				3647	struct compat_kvm_dirty_log compat_log;
				3648	struct kvm_dirty_log log;
				3649
				3650	if (copy_from_user(&compat_log, (void __user *)arg,
				3651	sizeof(compat_log)))
				3652	return -EFAULT;
				3653	log.slot = compat_log.slot;
				3654	log.padding1 = compat_log.padding1;
				3655	log.padding2 = compat_log.padding2;
				3656	log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
				3657
				3658	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3659	break;
				3660	}
				3661	default:
				3662	r = kvm_vm_ioctl(filp, ioctl, arg);
				3663	}
				3664	return r;
				3665	}
				3666	#endif
				3667
				3668	static struct file_operations kvm_vm_fops = {
				3669	.release = kvm_vm_release,
				3670	.unlocked_ioctl = kvm_vm_ioctl,
				3671	.llseek = noop_llseek,
				3672	KVM_COMPAT(kvm_vm_compat_ioctl),
				3673	};
				3674
				3675	static int kvm_dev_ioctl_create_vm(unsigned long type)
				3676	{
				3677	int r;
				3678	struct kvm *kvm;
				3679	struct file *file;
				3680
				3681	kvm = kvm_create_vm(type);
				3682	if (IS_ERR(kvm))
				3683	return PTR_ERR(kvm);
				3684	#ifdef CONFIG_KVM_MMIO
				3685	r = kvm_coalesced_mmio_init(kvm);
				3686	if (r < 0)
				3687	goto put_kvm;
				3688	#endif
				3689	r = get_unused_fd_flags(O_CLOEXEC);
				3690	if (r < 0)
				3691	goto put_kvm;
				3692
				3693	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
				3694	if (IS_ERR(file)) {
				3695	put_unused_fd(r);
				3696	r = PTR_ERR(file);
				3697	goto put_kvm;
				3698	}
				3699
				3700	/*
				3701	* Don't call kvm_put_kvm anymore at this point; file->f_op is
				3702	* already set, with ->release() being kvm_vm_release(). In error
				3703	* cases it will be called by the final fput(file) and will take
				3704	* care of doing kvm_put_kvm(kvm).
				3705	*/
				3706	if (kvm_create_vm_debugfs(kvm, r) < 0) {
				3707	put_unused_fd(r);
				3708	fput(file);
				3709	return -ENOMEM;
				3710	}
				3711	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
				3712
				3713	fd_install(r, file);
				3714	return r;
				3715
				3716	put_kvm:
				3717	kvm_put_kvm(kvm);
				3718	return r;
				3719	}
				3720
				3721	static long kvm_dev_ioctl(struct file *filp,
				3722	unsigned int ioctl, unsigned long arg)
				3723	{
				3724	long r = -EINVAL;
				3725
				3726	switch (ioctl) {
				3727	case KVM_GET_API_VERSION:
				3728	if (arg)
				3729	goto out;
				3730	r = KVM_API_VERSION;
				3731	break;
				3732	case KVM_CREATE_VM:
				3733	r = kvm_dev_ioctl_create_vm(arg);
				3734	break;
				3735	case KVM_CHECK_EXTENSION:
				3736	r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
				3737	break;
				3738	case KVM_GET_VCPU_MMAP_SIZE:
				3739	if (arg)
				3740	goto out;
				3741	r = PAGE_SIZE; /* struct kvm_run */
				3742	#ifdef CONFIG_X86
				3743	r += PAGE_SIZE; /* pio data page */
				3744	#endif
				3745	#ifdef CONFIG_KVM_MMIO
				3746	r += PAGE_SIZE; /* coalesced mmio ring page */
				3747	#endif
				3748	break;
				3749	case KVM_TRACE_ENABLE:
				3750	case KVM_TRACE_PAUSE:
				3751	case KVM_TRACE_DISABLE:
				3752	r = -EOPNOTSUPP;
				3753	break;
				3754	default:
				3755	return kvm_arch_dev_ioctl(filp, ioctl, arg);
				3756	}
				3757	out:
				3758	return r;
				3759	}
				3760
				3761	static struct file_operations kvm_chardev_ops = {
				3762	.unlocked_ioctl = kvm_dev_ioctl,
				3763	.llseek = noop_llseek,
				3764	KVM_COMPAT(kvm_dev_ioctl),
				3765	};
				3766
				3767	static struct miscdevice kvm_dev = {
				3768	KVM_MINOR,
				3769	"kvm",
				3770	&kvm_chardev_ops,
				3771	};
				3772
				3773	static void hardware_enable_nolock(void *junk)
				3774	{
				3775	int cpu = raw_smp_processor_id();
				3776	int r;
				3777
				3778	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3779	return;
				3780
				3781	cpumask_set_cpu(cpu, cpus_hardware_enabled);
				3782
				3783	r = kvm_arch_hardware_enable();
				3784
				3785	if (r) {
				3786	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3787	atomic_inc(&hardware_enable_failed);
				3788	pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
				3789	}
				3790	}
				3791
				3792	static int kvm_starting_cpu(unsigned int cpu)
				3793	{
				3794	raw_spin_lock(&kvm_count_lock);
				3795	if (kvm_usage_count)
				3796	hardware_enable_nolock(NULL);
				3797	raw_spin_unlock(&kvm_count_lock);
				3798	return 0;
				3799	}
				3800
				3801	static void hardware_disable_nolock(void *junk)
				3802	{
				3803	int cpu = raw_smp_processor_id();
				3804
				3805	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3806	return;
				3807	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3808	kvm_arch_hardware_disable();
				3809	}
				3810
				3811	static int kvm_dying_cpu(unsigned int cpu)
				3812	{
				3813	raw_spin_lock(&kvm_count_lock);
				3814	if (kvm_usage_count)
				3815	hardware_disable_nolock(NULL);
				3816	raw_spin_unlock(&kvm_count_lock);
				3817	return 0;
				3818	}
				3819
				3820	static void hardware_disable_all_nolock(void)
				3821	{
				3822	BUG_ON(!kvm_usage_count);
				3823
				3824	kvm_usage_count--;
				3825	if (!kvm_usage_count)
				3826	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3827	}
				3828
				3829	static void hardware_disable_all(void)
				3830	{
				3831	raw_spin_lock(&kvm_count_lock);
				3832	hardware_disable_all_nolock();
				3833	raw_spin_unlock(&kvm_count_lock);
				3834	}
				3835
				3836	static int hardware_enable_all(void)
				3837	{
				3838	int r = 0;
				3839
				3840	raw_spin_lock(&kvm_count_lock);
				3841
				3842	kvm_usage_count++;
				3843	if (kvm_usage_count == 1) {
				3844	atomic_set(&hardware_enable_failed, 0);
				3845	on_each_cpu(hardware_enable_nolock, NULL, 1);
				3846
				3847	if (atomic_read(&hardware_enable_failed)) {
				3848	hardware_disable_all_nolock();
				3849	r = -EBUSY;
				3850	}
				3851	}
				3852
				3853	raw_spin_unlock(&kvm_count_lock);
				3854
				3855	return r;
				3856	}
				3857
				3858	static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
				3859	void *v)
				3860	{
				3861	/*
				3862	* Some (well, at least mine) BIOSes hang on reboot if
				3863	* in vmx root mode.
				3864	*
				3865	* And Intel TXT required VMX off for all cpu when system shutdown.
				3866	*/
				3867	pr_info("kvm: exiting hardware virtualization\n");
				3868	kvm_rebooting = true;
				3869	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3870	return NOTIFY_OK;
				3871	}
				3872
				3873	static struct notifier_block kvm_reboot_notifier = {
				3874	.notifier_call = kvm_reboot,
				3875	.priority = 0,
				3876	};
				3877
				3878	static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
				3879	{
				3880	int i;
				3881
				3882	for (i = 0; i < bus->dev_count; i++) {
				3883	struct kvm_io_device *pos = bus->range[i].dev;
				3884
				3885	kvm_iodevice_destructor(pos);
				3886	}
				3887	kfree(bus);
				3888	}
				3889
				3890	static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
				3891	const struct kvm_io_range *r2)
				3892	{
				3893	gpa_t addr1 = r1->addr;
				3894	gpa_t addr2 = r2->addr;
				3895
				3896	if (addr1 < addr2)
				3897	return -1;
				3898
				3899	/* If r2->len == 0, match the exact address. If r2->len != 0,
				3900	* accept any overlapping write. Any order is acceptable for
				3901	* overlapping ranges, because kvm_io_bus_get_first_dev ensures
				3902	* we process all of them.
				3903	*/
				3904	if (r2->len) {
				3905	addr1 += r1->len;
				3906	addr2 += r2->len;
				3907	}
				3908
				3909	if (addr1 > addr2)
				3910	return 1;
				3911
				3912	return 0;
				3913	}
				3914
				3915	static int kvm_io_bus_sort_cmp(const void p1, const void p2)
				3916	{
				3917	return kvm_io_bus_cmp(p1, p2);
				3918	}
				3919
				3920	static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
				3921	gpa_t addr, int len)
				3922	{
				3923	struct kvm_io_range *range, key;
				3924	int off;
				3925
				3926	key = (struct kvm_io_range) {
				3927	.addr = addr,
				3928	.len = len,
				3929	};
				3930
				3931	range = bsearch(&key, bus->range, bus->dev_count,
				3932	sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
				3933	if (range == NULL)
				3934	return -ENOENT;
				3935
				3936	off = range - bus->range;
				3937
				3938	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
				3939	off--;
				3940
				3941	return off;
				3942	}
				3943
				3944	static int __kvm_io_bus_write(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				3945	struct kvm_io_range range, const void val)
				3946	{
				3947	int idx;
				3948
				3949	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				3950	if (idx < 0)
				3951	return -EOPNOTSUPP;
				3952
				3953	while (idx < bus->dev_count &&
				3954	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				3955	if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
				3956	range->len, val))
				3957	return idx;
				3958	idx++;
				3959	}
				3960
				3961	return -EOPNOTSUPP;
				3962	}
				3963
				3964	/* kvm_io_bus_write - called under kvm->slots_lock */
				3965	int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				3966	int len, const void *val)
				3967	{
				3968	struct kvm_io_bus *bus;
				3969	struct kvm_io_range range;
				3970	int r;
				3971
				3972	range = (struct kvm_io_range) {
				3973	.addr = addr,
				3974	.len = len,
				3975	};
				3976
				3977	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3978	if (!bus)
				3979	return -ENOMEM;
				3980	r = __kvm_io_bus_write(vcpu, bus, &range, val);
				3981	return r < 0 ? r : 0;
				3982	}
				3983	EXPORT_SYMBOL_GPL(kvm_io_bus_write);
				3984
				3985	/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
				3986	int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
				3987	gpa_t addr, int len, const void *val, long cookie)
				3988	{
				3989	struct kvm_io_bus *bus;
				3990	struct kvm_io_range range;
				3991
				3992	range = (struct kvm_io_range) {
				3993	.addr = addr,
				3994	.len = len,
				3995	};
				3996
				3997	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3998	if (!bus)
				3999	return -ENOMEM;
				4000
				4001	/* First try the device referenced by cookie. */
				4002	if ((cookie >= 0) && (cookie < bus->dev_count) &&
				4003	(kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
				4004	if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
				4005	val))
				4006	return cookie;
				4007
				4008	/*
				4009	* cookie contained garbage; fall back to search and return the
				4010	* correct cookie value.
				4011	*/
				4012	return __kvm_io_bus_write(vcpu, bus, &range, val);
				4013	}
				4014
				4015	static int __kvm_io_bus_read(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				4016	struct kvm_io_range range, void val)
				4017	{
				4018	int idx;
				4019
				4020	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				4021	if (idx < 0)
				4022	return -EOPNOTSUPP;
				4023
				4024	while (idx < bus->dev_count &&
				4025	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				4026	if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
				4027	range->len, val))
				4028	return idx;
				4029	idx++;
				4030	}
				4031
				4032	return -EOPNOTSUPP;
				4033	}
				4034
				4035	/* kvm_io_bus_read - called under kvm->slots_lock */
				4036	int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				4037	int len, void *val)
				4038	{
				4039	struct kvm_io_bus *bus;
				4040	struct kvm_io_range range;
				4041	int r;
				4042
				4043	range = (struct kvm_io_range) {
				4044	.addr = addr,
				4045	.len = len,
				4046	};
				4047
				4048	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				4049	if (!bus)
				4050	return -ENOMEM;
				4051	r = __kvm_io_bus_read(vcpu, bus, &range, val);
				4052	return r < 0 ? r : 0;
				4053	}
				4054
				4055	/* Caller must hold slots_lock. */
				4056	int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
				4057	int len, struct kvm_io_device *dev)
				4058	{
				4059	int i;
				4060	struct kvm_io_bus new_bus, bus;
				4061	struct kvm_io_range range;
				4062
				4063	bus = kvm_get_bus(kvm, bus_idx);
				4064	if (!bus)
				4065	return -ENOMEM;
				4066
				4067	/* exclude ioeventfd which is limited by maximum fd */
				4068	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
				4069	return -ENOSPC;
				4070
				4071	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
				4072	GFP_KERNEL_ACCOUNT);
				4073	if (!new_bus)
				4074	return -ENOMEM;
				4075
				4076	range = (struct kvm_io_range) {
				4077	.addr = addr,
				4078	.len = len,
				4079	.dev = dev,
				4080	};
				4081
				4082	for (i = 0; i < bus->dev_count; i++)
				4083	if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
				4084	break;
				4085
				4086	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
				4087	new_bus->dev_count++;
				4088	new_bus->range[i] = range;
				4089	memcpy(new_bus->range + i + 1, bus->range + i,
				4090	(bus->dev_count - i) * sizeof(struct kvm_io_range));
				4091	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				4092	synchronize_srcu_expedited(&kvm->srcu);
				4093	kfree(bus);
				4094
				4095	return 0;
				4096	}
				4097
				4098	/* Caller must hold slots_lock. */
				4099	int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
				4100	struct kvm_io_device *dev)
				4101	{
				4102	int i, j;
				4103	struct kvm_io_bus new_bus, bus;
				4104
				4105	bus = kvm_get_bus(kvm, bus_idx);
				4106	if (!bus)
				4107	return 0;
				4108
				4109	for (i = 0; i < bus->dev_count; i++)
				4110	if (bus->range[i].dev == dev) {
				4111	break;
				4112	}
				4113
				4114	if (i == bus->dev_count)
				4115	return 0;
				4116
				4117	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
				4118	GFP_KERNEL_ACCOUNT);
				4119	if (new_bus) {
				4120	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
				4121	new_bus->dev_count--;
				4122	memcpy(new_bus->range + i, bus->range + i + 1,
				4123	(new_bus->dev_count - i) * sizeof(struct kvm_io_range));
				4124	} else {
				4125	pr_err("kvm: failed to shrink bus, removing it completely\n");
				4126	for (j = 0; j < bus->dev_count; j++) {
				4127	if (j == i)
				4128	continue;
				4129	kvm_iodevice_destructor(bus->range[j].dev);
				4130	}
				4131	}
				4132
				4133	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				4134	synchronize_srcu_expedited(&kvm->srcu);
				4135	kfree(bus);
				4136	return new_bus ? 0 : -ENOMEM;
				4137	}
				4138
				4139	struct kvm_io_device kvm_io_bus_get_dev(struct kvm kvm, enum kvm_bus bus_idx,
				4140	gpa_t addr)
				4141	{
				4142	struct kvm_io_bus *bus;
				4143	int dev_idx, srcu_idx;
				4144	struct kvm_io_device *iodev = NULL;
				4145
				4146	srcu_idx = srcu_read_lock(&kvm->srcu);
				4147
				4148	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
				4149	if (!bus)
				4150	goto out_unlock;
				4151
				4152	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
				4153	if (dev_idx < 0)
				4154	goto out_unlock;
				4155
				4156	iodev = bus->range[dev_idx].dev;
				4157
				4158	out_unlock:
				4159	srcu_read_unlock(&kvm->srcu, srcu_idx);
				4160
				4161	return iodev;
				4162	}
				4163	EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
				4164
				4165	static int kvm_debugfs_open(struct inode inode, struct file file,
				4166	int (get)(void , u64 ), int (set)(void *, u64),
				4167	const char *fmt)
				4168	{
				4169	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				4170	inode->i_private;
				4171
				4172	/* The debugfs files are a reference to the kvm struct which
				4173	* is still valid when kvm_destroy_vm is called.
				4174	* To avoid the race between open and the removal of the debugfs
				4175	* directory we test against the users count.
				4176	*/
				4177	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
				4178	return -ENOENT;
				4179
				4180	if (simple_attr_open(inode, file, get,
				4181	stat_data->mode & S_IWUGO ? set : NULL,
				4182	fmt)) {
				4183	kvm_put_kvm(stat_data->kvm);
				4184	return -ENOMEM;
				4185	}
				4186
				4187	return 0;
				4188	}
				4189
				4190	static int kvm_debugfs_release(struct inode inode, struct file file)
				4191	{
				4192	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				4193	inode->i_private;
				4194
				4195	simple_attr_release(inode, file);
				4196	kvm_put_kvm(stat_data->kvm);
				4197
				4198	return 0;
				4199	}
				4200
				4201	static int vm_stat_get_per_vm(void data, u64 val)
				4202	{
				4203	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				4204
				4205	val = (ulong )((void )stat_data->kvm + stat_data->offset);
				4206
				4207	return 0;
				4208	}
				4209
				4210	static int vm_stat_clear_per_vm(void *data, u64 val)
				4211	{
				4212	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				4213
				4214	if (val)
				4215	return -EINVAL;
				4216
				4217	(ulong )((void *)stat_data->kvm + stat_data->offset) = 0;
				4218
				4219	return 0;
				4220	}
				4221
				4222	static int vm_stat_get_per_vm_open(struct inode inode, struct file file)
				4223	{
				4224	__simple_attr_check_format("%llu\n", 0ull);
				4225	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
				4226	vm_stat_clear_per_vm, "%llu\n");
				4227	}
				4228
				4229	static const struct file_operations vm_stat_get_per_vm_fops = {
				4230	.owner = THIS_MODULE,
				4231	.open = vm_stat_get_per_vm_open,
				4232	.release = kvm_debugfs_release,
				4233	.read = simple_attr_read,
				4234	.write = simple_attr_write,
				4235	.llseek = no_llseek,
				4236	};
				4237
				4238	static int vcpu_stat_get_per_vm(void data, u64 val)
				4239	{
				4240	int i;
				4241	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				4242	struct kvm_vcpu *vcpu;
				4243
				4244	*val = 0;
				4245
				4246	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				4247	val += (u64 )((void )vcpu + stat_data->offset);
				4248
				4249	return 0;
				4250	}
				4251
				4252	static int vcpu_stat_clear_per_vm(void *data, u64 val)
				4253	{
				4254	int i;
				4255	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				4256	struct kvm_vcpu *vcpu;
				4257
				4258	if (val)
				4259	return -EINVAL;
				4260
				4261	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				4262	(u64 )((void *)vcpu + stat_data->offset) = 0;
				4263
				4264	return 0;
				4265	}
				4266
				4267	static int vcpu_stat_get_per_vm_open(struct inode inode, struct file file)
				4268	{
				4269	__simple_attr_check_format("%llu\n", 0ull);
				4270	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
				4271	vcpu_stat_clear_per_vm, "%llu\n");
				4272	}
				4273
				4274	static const struct file_operations vcpu_stat_get_per_vm_fops = {
				4275	.owner = THIS_MODULE,
				4276	.open = vcpu_stat_get_per_vm_open,
				4277	.release = kvm_debugfs_release,
				4278	.read = simple_attr_read,
				4279	.write = simple_attr_write,
				4280	.llseek = no_llseek,
				4281	};
				4282
				4283	static const struct file_operations *stat_fops_per_vm[] = {
				4284	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
				4285	[KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
				4286	};
				4287
				4288	static int vm_stat_get(void _offset, u64 val)
				4289	{
				4290	unsigned offset = (long)_offset;
				4291	struct kvm *kvm;
				4292	struct kvm_stat_data stat_tmp = {.offset = offset};
				4293	u64 tmp_val;
				4294
				4295	*val = 0;
				4296	mutex_lock(&kvm_lock);
				4297	list_for_each_entry(kvm, &vm_list, vm_list) {
				4298	stat_tmp.kvm = kvm;
				4299	vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				4300	*val += tmp_val;
				4301	}
				4302	mutex_unlock(&kvm_lock);
				4303	return 0;
				4304	}
				4305
				4306	static int vm_stat_clear(void *_offset, u64 val)
				4307	{
				4308	unsigned offset = (long)_offset;
				4309	struct kvm *kvm;
				4310	struct kvm_stat_data stat_tmp = {.offset = offset};
				4311
				4312	if (val)
				4313	return -EINVAL;
				4314
				4315	mutex_lock(&kvm_lock);
				4316	list_for_each_entry(kvm, &vm_list, vm_list) {
				4317	stat_tmp.kvm = kvm;
				4318	vm_stat_clear_per_vm((void *)&stat_tmp, 0);
				4319	}
				4320	mutex_unlock(&kvm_lock);
				4321
				4322	return 0;
				4323	}
				4324
				4325	DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
				4326
				4327	static int vcpu_stat_get(void _offset, u64 val)
				4328	{
				4329	unsigned offset = (long)_offset;
				4330	struct kvm *kvm;
				4331	struct kvm_stat_data stat_tmp = {.offset = offset};
				4332	u64 tmp_val;
				4333
				4334	*val = 0;
				4335	mutex_lock(&kvm_lock);
				4336	list_for_each_entry(kvm, &vm_list, vm_list) {
				4337	stat_tmp.kvm = kvm;
				4338	vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				4339	*val += tmp_val;
				4340	}
				4341	mutex_unlock(&kvm_lock);
				4342	return 0;
				4343	}
				4344
				4345	static int vcpu_stat_clear(void *_offset, u64 val)
				4346	{
				4347	unsigned offset = (long)_offset;
				4348	struct kvm *kvm;
				4349	struct kvm_stat_data stat_tmp = {.offset = offset};
				4350
				4351	if (val)
				4352	return -EINVAL;
				4353
				4354	mutex_lock(&kvm_lock);
				4355	list_for_each_entry(kvm, &vm_list, vm_list) {
				4356	stat_tmp.kvm = kvm;
				4357	vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
				4358	}
				4359	mutex_unlock(&kvm_lock);
				4360
				4361	return 0;
				4362	}
				4363
				4364	DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
				4365	"%llu\n");
				4366
				4367	static const struct file_operations *stat_fops[] = {
				4368	[KVM_STAT_VCPU] = &vcpu_stat_fops,
				4369	[KVM_STAT_VM] = &vm_stat_fops,
				4370	};
				4371
				4372	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
				4373	{
				4374	struct kobj_uevent_env *env;
				4375	unsigned long long created, active;
				4376
				4377	if (!kvm_dev.this_device \|\| !kvm)
				4378	return;
				4379
				4380	mutex_lock(&kvm_lock);
				4381	if (type == KVM_EVENT_CREATE_VM) {
				4382	kvm_createvm_count++;
				4383	kvm_active_vms++;
				4384	} else if (type == KVM_EVENT_DESTROY_VM) {
				4385	kvm_active_vms--;
				4386	}
				4387	created = kvm_createvm_count;
				4388	active = kvm_active_vms;
				4389	mutex_unlock(&kvm_lock);
				4390
				4391	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
				4392	if (!env)
				4393	return;
				4394
				4395	add_uevent_var(env, "CREATED=%llu", created);
				4396	add_uevent_var(env, "COUNT=%llu", active);
				4397
				4398	if (type == KVM_EVENT_CREATE_VM) {
				4399	add_uevent_var(env, "EVENT=create");
				4400	kvm->userspace_pid = task_pid_nr(current);
				4401	} else if (type == KVM_EVENT_DESTROY_VM) {
				4402	add_uevent_var(env, "EVENT=destroy");
				4403	}
				4404	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
				4405
				4406	if (kvm->debugfs_dentry) {
				4407	char tmp, p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
				4408
				4409	if (p) {
				4410	tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
				4411	if (!IS_ERR(tmp))
				4412	add_uevent_var(env, "STATS_PATH=%s", tmp);
				4413	kfree(p);
				4414	}
				4415	}
				4416	/* no need for checks, since we are adding at most only 5 keys */
				4417	env->envp[env->envp_idx++] = NULL;
				4418	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
				4419	kfree(env);
				4420	}
				4421
				4422	static void kvm_init_debug(void)
				4423	{
				4424	struct kvm_stats_debugfs_item *p;
				4425
				4426	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
				4427
				4428	kvm_debugfs_num_entries = 0;
				4429	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
				4430	int mode = p->mode ? p->mode : 0644;
				4431	debugfs_create_file(p->name, mode, kvm_debugfs_dir,
				4432	(void *)(long)p->offset,
				4433	stat_fops[p->kind]);
				4434	}
				4435	}
				4436
				4437	static int kvm_suspend(void)
				4438	{
				4439	if (kvm_usage_count)
				4440	hardware_disable_nolock(NULL);
				4441	return 0;
				4442	}
				4443
				4444	static void kvm_resume(void)
				4445	{
				4446	if (kvm_usage_count) {
				4447	#ifdef CONFIG_LOCKDEP
				4448	WARN_ON(lockdep_is_held(&kvm_count_lock));
				4449	#endif
				4450	hardware_enable_nolock(NULL);
				4451	}
				4452	}
				4453
				4454	static struct syscore_ops kvm_syscore_ops = {
				4455	.suspend = kvm_suspend,
				4456	.resume = kvm_resume,
				4457	};
				4458
				4459	static inline
				4460	struct kvm_vcpu preempt_notifier_to_vcpu(struct preempt_notifier pn)
				4461	{
				4462	return container_of(pn, struct kvm_vcpu, preempt_notifier);
				4463	}
				4464
				4465	static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
				4466	{
				4467	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4468
				4469	WRITE_ONCE(vcpu->preempted, false);
				4470	WRITE_ONCE(vcpu->ready, false);
				4471
				4472	kvm_arch_sched_in(vcpu, cpu);
				4473
				4474	kvm_arch_vcpu_load(vcpu, cpu);
				4475	}
				4476
				4477	static void kvm_sched_out(struct preempt_notifier *pn,
				4478	struct task_struct *next)
				4479	{
				4480	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4481
				4482	if (current->state == TASK_RUNNING) {
				4483	WRITE_ONCE(vcpu->preempted, true);
				4484	WRITE_ONCE(vcpu->ready, true);
				4485	}
				4486	kvm_arch_vcpu_put(vcpu);
				4487	}
				4488
				4489	static void check_processor_compat(void *rtn)
				4490	{
				4491	(int )rtn = kvm_arch_check_processor_compat();
				4492	}
				4493
				4494	int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
				4495	struct module *module)
				4496	{
				4497	int r;
				4498	int cpu;
				4499
				4500	r = kvm_arch_init(opaque);
				4501	if (r)
				4502	goto out_fail;
				4503
				4504	/*
				4505	* kvm_arch_init makes sure there's at most one caller
				4506	* for architectures that support multiple implementations,
				4507	* like intel and amd on x86.
				4508	* kvm_arch_init must be called before kvm_irqfd_init to avoid creating
				4509	* conflicts in case kvm is already setup for another implementation.
				4510	*/
				4511	r = kvm_irqfd_init();
				4512	if (r)
				4513	goto out_irqfd;
				4514
				4515	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
				4516	r = -ENOMEM;
				4517	goto out_free_0;
				4518	}
				4519
				4520	r = kvm_arch_hardware_setup();
				4521	if (r < 0)
				4522	goto out_free_0a;
				4523
				4524	for_each_online_cpu(cpu) {
				4525	smp_call_function_single(cpu, check_processor_compat, &r, 1);
				4526	if (r < 0)
				4527	goto out_free_1;
				4528	}
				4529
				4530	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				4531	kvm_starting_cpu, kvm_dying_cpu);
				4532	if (r)
				4533	goto out_free_2;
				4534	register_reboot_notifier(&kvm_reboot_notifier);
				4535
				4536	/* A kmem cache lets us meet the alignment requirements of fx_save. */
				4537	if (!vcpu_align)
				4538	vcpu_align = __alignof__(struct kvm_vcpu);
				4539	kvm_vcpu_cache =
				4540	kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
				4541	SLAB_ACCOUNT,
				4542	offsetof(struct kvm_vcpu, arch),
				4543	sizeof_field(struct kvm_vcpu, arch),
				4544	NULL);
				4545	if (!kvm_vcpu_cache) {
				4546	r = -ENOMEM;
				4547	goto out_free_3;
				4548	}
				4549
				4550	r = kvm_async_pf_init();
				4551	if (r)
				4552	goto out_free;
				4553
				4554	kvm_chardev_ops.owner = module;
				4555	kvm_vm_fops.owner = module;
				4556	kvm_vcpu_fops.owner = module;
				4557
				4558	r = misc_register(&kvm_dev);
				4559	if (r) {
				4560	pr_err("kvm: misc device register failed\n");
				4561	goto out_unreg;
				4562	}
				4563
				4564	register_syscore_ops(&kvm_syscore_ops);
				4565
				4566	kvm_preempt_ops.sched_in = kvm_sched_in;
				4567	kvm_preempt_ops.sched_out = kvm_sched_out;
				4568
				4569	kvm_init_debug();
				4570
				4571	r = kvm_vfio_ops_init();
				4572	WARN_ON(r);
				4573
				4574	return 0;
				4575
				4576	out_unreg:
				4577	kvm_async_pf_deinit();
				4578	out_free:
				4579	kmem_cache_destroy(kvm_vcpu_cache);
				4580	out_free_3:
				4581	unregister_reboot_notifier(&kvm_reboot_notifier);
				4582	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4583	out_free_2:
				4584	out_free_1:
				4585	kvm_arch_hardware_unsetup();
				4586	out_free_0a:
				4587	free_cpumask_var(cpus_hardware_enabled);
				4588	out_free_0:
				4589	kvm_irqfd_exit();
				4590	out_irqfd:
				4591	kvm_arch_exit();
				4592	out_fail:
				4593	return r;
				4594	}
				4595	EXPORT_SYMBOL_GPL(kvm_init);
				4596
				4597	void kvm_exit(void)
				4598	{
				4599	debugfs_remove_recursive(kvm_debugfs_dir);
				4600	misc_deregister(&kvm_dev);
				4601	kmem_cache_destroy(kvm_vcpu_cache);
				4602	kvm_async_pf_deinit();
				4603	unregister_syscore_ops(&kvm_syscore_ops);
				4604	unregister_reboot_notifier(&kvm_reboot_notifier);
				4605	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4606	on_each_cpu(hardware_disable_nolock, NULL, 1);
				4607	kvm_arch_hardware_unsetup();
				4608	kvm_arch_exit();
				4609	kvm_irqfd_exit();
				4610	free_cpumask_var(cpus_hardware_enabled);
				4611	kvm_vfio_ops_exit();
				4612	}
				4613	EXPORT_SYMBOL_GPL(kvm_exit);
				4614
				4615	struct kvm_vm_worker_thread_context {
				4616	struct kvm *kvm;
				4617	struct task_struct *parent;
				4618	struct completion init_done;
				4619	kvm_vm_thread_fn_t thread_fn;
				4620	uintptr_t data;
				4621	int err;
				4622	};
				4623
				4624	static int kvm_vm_worker_thread(void *context)
				4625	{
				4626	/*
				4627	* The init_context is allocated on the stack of the parent thread, so
				4628	* we have to locally copy anything that is needed beyond initialization
				4629	*/
				4630	struct kvm_vm_worker_thread_context *init_context = context;
				4631	struct kvm *kvm = init_context->kvm;
				4632	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
				4633	uintptr_t data = init_context->data;
				4634	int err;
				4635
				4636	err = kthread_park(current);
				4637	/* kthread_park(current) is never supposed to return an error */
				4638	WARN_ON(err != 0);
				4639	if (err)
				4640	goto init_complete;
				4641
				4642	err = cgroup_attach_task_all(init_context->parent, current);
				4643	if (err) {
				4644	kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
				4645	__func__, err);
				4646	goto init_complete;
				4647	}
				4648
				4649	set_user_nice(current, task_nice(init_context->parent));
				4650
				4651	init_complete:
				4652	init_context->err = err;
				4653	complete(&init_context->init_done);
				4654	init_context = NULL;
				4655
				4656	if (err)
				4657	return err;
				4658
				4659	/* Wait to be woken up by the spawner before proceeding. */
				4660	kthread_parkme();
				4661
				4662	if (!kthread_should_stop())
				4663	err = thread_fn(kvm, data);
				4664
				4665	return err;
				4666	}
				4667
				4668	int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				4669	uintptr_t data, const char *name,
				4670	struct task_struct **thread_ptr)
				4671	{
				4672	struct kvm_vm_worker_thread_context init_context = {};
				4673	struct task_struct *thread;
				4674
				4675	*thread_ptr = NULL;
				4676	init_context.kvm = kvm;
				4677	init_context.parent = current;
				4678	init_context.thread_fn = thread_fn;
				4679	init_context.data = data;
				4680	init_completion(&init_context.init_done);
				4681
				4682	thread = kthread_run(kvm_vm_worker_thread, &init_context,
				4683	"%s-%d", name, task_pid_nr(current));
				4684	if (IS_ERR(thread))
				4685	return PTR_ERR(thread);
				4686
				4687	/* kthread_run is never supposed to return NULL */
				4688	WARN_ON(thread == NULL);
				4689
				4690	wait_for_completion(&init_context.init_done);
				4691
				4692	if (!init_context.err)
				4693	*thread_ptr = thread;
				4694
				4695	return init_context.err;
				4696	}