Blame - src/kernel/linux/v4.19/virt/kvm/kvm_main.c - T800

blob: 9502b1a44232ca4b7d10a3ee55fdf5a98d045930 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* This module enables machines with Intel VT-x extensions to run virtual
				5	* machines without emulation or binary translation.
				6	*
				7	* Copyright (C) 2006 Qumranet, Inc.
				8	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				9	*
				10	* Authors:
				11	* Avi Kivity <avi@qumranet.com>
				12	* Yaniv Kamay <yaniv@qumranet.com>
				13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2. See
				15	* the COPYING file in the top-level directory.
				16	*
				17	*/
				18
				19	#include <kvm/iodev.h>
				20
				21	#include <linux/kvm_host.h>
				22	#include <linux/kvm.h>
				23	#include <linux/module.h>
				24	#include <linux/errno.h>
				25	#include <linux/percpu.h>
				26	#include <linux/mm.h>
				27	#include <linux/miscdevice.h>
				28	#include <linux/vmalloc.h>
				29	#include <linux/reboot.h>
				30	#include <linux/debugfs.h>
				31	#include <linux/highmem.h>
				32	#include <linux/file.h>
				33	#include <linux/syscore_ops.h>
				34	#include <linux/cpu.h>
				35	#include <linux/sched/signal.h>
				36	#include <linux/sched/mm.h>
				37	#include <linux/sched/stat.h>
				38	#include <linux/cpumask.h>
				39	#include <linux/smp.h>
				40	#include <linux/anon_inodes.h>
				41	#include <linux/profile.h>
				42	#include <linux/kvm_para.h>
				43	#include <linux/pagemap.h>
				44	#include <linux/mman.h>
				45	#include <linux/swap.h>
				46	#include <linux/bitops.h>
				47	#include <linux/spinlock.h>
				48	#include <linux/compat.h>
				49	#include <linux/srcu.h>
				50	#include <linux/hugetlb.h>
				51	#include <linux/slab.h>
				52	#include <linux/sort.h>
				53	#include <linux/bsearch.h>
				54	#include <linux/kthread.h>
				55
				56	#include <asm/processor.h>
				57	#include <asm/io.h>
				58	#include <asm/ioctl.h>
				59	#include <linux/uaccess.h>
				60	#include <asm/pgtable.h>
				61
				62	#include "coalesced_mmio.h"
				63	#include "async_pf.h"
				64	#include "vfio.h"
				65
				66	#define CREATE_TRACE_POINTS
				67	#include <trace/events/kvm.h>
				68
				69	/* Worst case buffer size needed for holding an integer. */
				70	#define ITOA_MAX_LEN 12
				71
				72	MODULE_AUTHOR("Qumranet");
				73	MODULE_LICENSE("GPL");
				74
				75	/* Architectures should define their poll value according to the halt latency */
				76	unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
				77	module_param(halt_poll_ns, uint, 0644);
				78	EXPORT_SYMBOL_GPL(halt_poll_ns);
				79
				80	/* Default doubles per-vcpu halt_poll_ns. */
				81	unsigned int halt_poll_ns_grow = 2;
				82	module_param(halt_poll_ns_grow, uint, 0644);
				83	EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
				84
				85	/* Default resets per-vcpu halt_poll_ns . */
				86	unsigned int halt_poll_ns_shrink;
				87	module_param(halt_poll_ns_shrink, uint, 0644);
				88	EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
				89
				90	/*
				91	* Ordering of locks:
				92	*
				93	* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
				94	*/
				95
				96	DEFINE_MUTEX(kvm_lock);
				97	static DEFINE_RAW_SPINLOCK(kvm_count_lock);
				98	LIST_HEAD(vm_list);
				99
				100	static cpumask_var_t cpus_hardware_enabled;
				101	static int kvm_usage_count;
				102	static atomic_t hardware_enable_failed;
				103
				104	struct kmem_cache *kvm_vcpu_cache;
				105	EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
				106
				107	static __read_mostly struct preempt_ops kvm_preempt_ops;
				108
				109	struct dentry *kvm_debugfs_dir;
				110	EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
				111
				112	static int kvm_debugfs_num_entries;
				113	static const struct file_operations *stat_fops_per_vm[];
				114
				115	static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
				116	unsigned long arg);
				117	#ifdef CONFIG_KVM_COMPAT
				118	static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				119	unsigned long arg);
				120	#define KVM_COMPAT(c) .compat_ioctl = (c)
				121	#else
				122	static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				123	unsigned long arg) { return -EINVAL; }
				124	#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
				125	#endif
				126	static int hardware_enable_all(void);
				127	static void hardware_disable_all(void);
				128
				129	static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
				130
				131	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
				132
				133	__visible bool kvm_rebooting;
				134	EXPORT_SYMBOL_GPL(kvm_rebooting);
				135
				136	static bool largepages_enabled = true;
				137
				138	#define KVM_EVENT_CREATE_VM 0
				139	#define KVM_EVENT_DESTROY_VM 1
				140	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
				141	static unsigned long long kvm_createvm_count;
				142	static unsigned long long kvm_active_vms;
				143
				144	__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
				145	unsigned long start, unsigned long end, bool blockable)
				146	{
				147	return 0;
				148	}
				149
				150	bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
				151	{
				152	/*
				153	* The metadata used by is_zone_device_page() to determine whether or
				154	* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
				155	* the device has been pinned, e.g. by get_user_pages(). WARN if the
				156	* page_count() is zero to help detect bad usage of this helper.
				157	*/
				158	if (!pfn_valid(pfn) \|\| WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
				159	return false;
				160
				161	return is_zone_device_page(pfn_to_page(pfn));
				162	}
				163
				164	bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
				165	{
				166	/*
				167	* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
				168	* perspective they are "normal" pages, albeit with slightly different
				169	* usage rules.
				170	*/
				171	if (pfn_valid(pfn))
				172	return PageReserved(pfn_to_page(pfn)) &&
				173	!kvm_is_zone_device_pfn(pfn);
				174
				175	return true;
				176	}
				177
				178	/*
				179	* Switches to specified vcpu, until a matching vcpu_put()
				180	*/
				181	void vcpu_load(struct kvm_vcpu *vcpu)
				182	{
				183	int cpu = get_cpu();
				184	preempt_notifier_register(&vcpu->preempt_notifier);
				185	kvm_arch_vcpu_load(vcpu, cpu);
				186	put_cpu();
				187	}
				188	EXPORT_SYMBOL_GPL(vcpu_load);
				189
				190	void vcpu_put(struct kvm_vcpu *vcpu)
				191	{
				192	preempt_disable();
				193	kvm_arch_vcpu_put(vcpu);
				194	preempt_notifier_unregister(&vcpu->preempt_notifier);
				195	preempt_enable();
				196	}
				197	EXPORT_SYMBOL_GPL(vcpu_put);
				198
				199	/* TODO: merge with kvm_arch_vcpu_should_kick */
				200	static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
				201	{
				202	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
				203
				204	/*
				205	* We need to wait for the VCPU to reenable interrupts and get out of
				206	* READING_SHADOW_PAGE_TABLES mode.
				207	*/
				208	if (req & KVM_REQUEST_WAIT)
				209	return mode != OUTSIDE_GUEST_MODE;
				210
				211	/*
				212	* Need to kick a running VCPU, but otherwise there is nothing to do.
				213	*/
				214	return mode == IN_GUEST_MODE;
				215	}
				216
				217	static void ack_flush(void *_completed)
				218	{
				219	}
				220
				221	static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
				222	{
				223	if (unlikely(!cpus))
				224	cpus = cpu_online_mask;
				225
				226	if (cpumask_empty(cpus))
				227	return false;
				228
				229	smp_call_function_many(cpus, ack_flush, NULL, wait);
				230	return true;
				231	}
				232
				233	bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				234	unsigned long *vcpu_bitmap, cpumask_var_t tmp)
				235	{
				236	int i, cpu, me;
				237	struct kvm_vcpu *vcpu;
				238	bool called;
				239
				240	me = get_cpu();
				241
				242	kvm_for_each_vcpu(i, vcpu, kvm) {
				243	if (!test_bit(i, vcpu_bitmap))
				244	continue;
				245
				246	kvm_make_request(req, vcpu);
				247	cpu = vcpu->cpu;
				248
				249	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
				250	continue;
				251
				252	if (tmp != NULL && cpu != -1 && cpu != me &&
				253	kvm_request_needs_ipi(vcpu, req))
				254	__cpumask_set_cpu(cpu, tmp);
				255	}
				256
				257	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
				258	put_cpu();
				259
				260	return called;
				261	}
				262
				263	bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
				264	{
				265	cpumask_var_t cpus;
				266	bool called;
				267	static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
				268	= {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
				269
				270	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
				271
				272	called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
				273
				274	free_cpumask_var(cpus);
				275	return called;
				276	}
				277
				278	#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
				279	void kvm_flush_remote_tlbs(struct kvm *kvm)
				280	{
				281	/*
				282	* Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
				283	* kvm_make_all_cpus_request.
				284	*/
				285	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
				286
				287	/*
				288	* We want to publish modifications to the page tables before reading
				289	* mode. Pairs with a memory barrier in arch-specific code.
				290	* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
				291	* and smp_mb in walk_shadow_page_lockless_begin/end.
				292	* - powerpc: smp_mb in kvmppc_prepare_to_enter.
				293	*
				294	* There is already an smp_mb__after_atomic() before
				295	* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
				296	* barrier here.
				297	*/
				298	if (!kvm_arch_flush_remote_tlb(kvm)
				299	\|\| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
				300	++kvm->stat.remote_tlb_flush;
				301	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
				302	}
				303	EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
				304	#endif
				305
				306	void kvm_reload_remote_mmus(struct kvm *kvm)
				307	{
				308	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
				309	}
				310
				311	int kvm_vcpu_init(struct kvm_vcpu vcpu, struct kvm kvm, unsigned id)
				312	{
				313	struct page *page;
				314	int r;
				315
				316	mutex_init(&vcpu->mutex);
				317	vcpu->cpu = -1;
				318	vcpu->kvm = kvm;
				319	vcpu->vcpu_id = id;
				320	vcpu->pid = NULL;
				321	init_swait_queue_head(&vcpu->wq);
				322	kvm_async_pf_vcpu_init(vcpu);
				323
				324	vcpu->pre_pcpu = -1;
				325	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
				326
				327	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				328	if (!page) {
				329	r = -ENOMEM;
				330	goto fail;
				331	}
				332	vcpu->run = page_address(page);
				333
				334	kvm_vcpu_set_in_spin_loop(vcpu, false);
				335	kvm_vcpu_set_dy_eligible(vcpu, false);
				336	vcpu->preempted = false;
				337
				338	r = kvm_arch_vcpu_init(vcpu);
				339	if (r < 0)
				340	goto fail_free_run;
				341	return 0;
				342
				343	fail_free_run:
				344	free_page((unsigned long)vcpu->run);
				345	fail:
				346	return r;
				347	}
				348	EXPORT_SYMBOL_GPL(kvm_vcpu_init);
				349
				350	void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
				351	{
				352	/*
				353	* no need for rcu_read_lock as VCPU_RUN is the only place that
				354	* will change the vcpu->pid pointer and on uninit all file
				355	* descriptors are already gone.
				356	*/
				357	put_pid(rcu_dereference_protected(vcpu->pid, 1));
				358	kvm_arch_vcpu_uninit(vcpu);
				359	free_page((unsigned long)vcpu->run);
				360	}
				361	EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
				362
				363	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				364	static inline struct kvm mmu_notifier_to_kvm(struct mmu_notifier mn)
				365	{
				366	return container_of(mn, struct kvm, mmu_notifier);
				367	}
				368
				369	static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
				370	struct mm_struct *mm,
				371	unsigned long address,
				372	pte_t pte)
				373	{
				374	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				375	int idx;
				376
				377	idx = srcu_read_lock(&kvm->srcu);
				378	spin_lock(&kvm->mmu_lock);
				379	kvm->mmu_notifier_seq++;
				380	kvm_set_spte_hva(kvm, address, pte);
				381	spin_unlock(&kvm->mmu_lock);
				382	srcu_read_unlock(&kvm->srcu, idx);
				383	}
				384
				385	static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
				386	struct mm_struct *mm,
				387	unsigned long start,
				388	unsigned long end,
				389	bool blockable)
				390	{
				391	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				392	int need_tlb_flush = 0, idx;
				393	int ret;
				394
				395	idx = srcu_read_lock(&kvm->srcu);
				396	spin_lock(&kvm->mmu_lock);
				397	/*
				398	* The count increase must become visible at unlock time as no
				399	* spte can be established without taking the mmu_lock and
				400	* count is also read inside the mmu_lock critical section.
				401	*/
				402	kvm->mmu_notifier_count++;
				403	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
				404	need_tlb_flush \|= kvm->tlbs_dirty;
				405	/* we've to flush the tlb before the pages can be freed */
				406	if (need_tlb_flush)
				407	kvm_flush_remote_tlbs(kvm);
				408
				409	spin_unlock(&kvm->mmu_lock);
				410
				411	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
				412
				413	srcu_read_unlock(&kvm->srcu, idx);
				414
				415	return ret;
				416	}
				417
				418	static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
				419	struct mm_struct *mm,
				420	unsigned long start,
				421	unsigned long end)
				422	{
				423	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				424
				425	spin_lock(&kvm->mmu_lock);
				426	/*
				427	* This sequence increase will notify the kvm page fault that
				428	* the page that is going to be mapped in the spte could have
				429	* been freed.
				430	*/
				431	kvm->mmu_notifier_seq++;
				432	smp_wmb();
				433	/*
				434	* The above sequence increase must be visible before the
				435	* below count decrease, which is ensured by the smp_wmb above
				436	* in conjunction with the smp_rmb in mmu_notifier_retry().
				437	*/
				438	kvm->mmu_notifier_count--;
				439	spin_unlock(&kvm->mmu_lock);
				440
				441	BUG_ON(kvm->mmu_notifier_count < 0);
				442	}
				443
				444	static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
				445	struct mm_struct *mm,
				446	unsigned long start,
				447	unsigned long end)
				448	{
				449	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				450	int young, idx;
				451
				452	idx = srcu_read_lock(&kvm->srcu);
				453	spin_lock(&kvm->mmu_lock);
				454
				455	young = kvm_age_hva(kvm, start, end);
				456	if (young)
				457	kvm_flush_remote_tlbs(kvm);
				458
				459	spin_unlock(&kvm->mmu_lock);
				460	srcu_read_unlock(&kvm->srcu, idx);
				461
				462	return young;
				463	}
				464
				465	static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
				466	struct mm_struct *mm,
				467	unsigned long start,
				468	unsigned long end)
				469	{
				470	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				471	int young, idx;
				472
				473	idx = srcu_read_lock(&kvm->srcu);
				474	spin_lock(&kvm->mmu_lock);
				475	/*
				476	* Even though we do not flush TLB, this will still adversely
				477	* affect performance on pre-Haswell Intel EPT, where there is
				478	* no EPT Access Bit to clear so that we have to tear down EPT
				479	* tables instead. If we find this unacceptable, we can always
				480	* add a parameter to kvm_age_hva so that it effectively doesn't
				481	* do anything on clear_young.
				482	*
				483	* Also note that currently we never issue secondary TLB flushes
				484	* from clear_young, leaving this job up to the regular system
				485	* cadence. If we find this inaccurate, we might come up with a
				486	* more sophisticated heuristic later.
				487	*/
				488	young = kvm_age_hva(kvm, start, end);
				489	spin_unlock(&kvm->mmu_lock);
				490	srcu_read_unlock(&kvm->srcu, idx);
				491
				492	return young;
				493	}
				494
				495	static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				496	struct mm_struct *mm,
				497	unsigned long address)
				498	{
				499	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				500	int young, idx;
				501
				502	idx = srcu_read_lock(&kvm->srcu);
				503	spin_lock(&kvm->mmu_lock);
				504	young = kvm_test_age_hva(kvm, address);
				505	spin_unlock(&kvm->mmu_lock);
				506	srcu_read_unlock(&kvm->srcu, idx);
				507
				508	return young;
				509	}
				510
				511	static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				512	struct mm_struct *mm)
				513	{
				514	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				515	int idx;
				516
				517	idx = srcu_read_lock(&kvm->srcu);
				518	kvm_arch_flush_shadow_all(kvm);
				519	srcu_read_unlock(&kvm->srcu, idx);
				520	}
				521
				522	static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
				523	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
				524	.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
				525	.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
				526	.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
				527	.clear_young = kvm_mmu_notifier_clear_young,
				528	.test_young = kvm_mmu_notifier_test_young,
				529	.change_pte = kvm_mmu_notifier_change_pte,
				530	.release = kvm_mmu_notifier_release,
				531	};
				532
				533	static int kvm_init_mmu_notifier(struct kvm *kvm)
				534	{
				535	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
				536	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
				537	}
				538
				539	#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
				540
				541	static int kvm_init_mmu_notifier(struct kvm *kvm)
				542	{
				543	return 0;
				544	}
				545
				546	#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
				547
				548	static struct kvm_memslots *kvm_alloc_memslots(void)
				549	{
				550	int i;
				551	struct kvm_memslots *slots;
				552
				553	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
				554	if (!slots)
				555	return NULL;
				556
				557	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
				558	slots->id_to_index[i] = slots->memslots[i].id = i;
				559
				560	return slots;
				561	}
				562
				563	static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
				564	{
				565	if (!memslot->dirty_bitmap)
				566	return;
				567
				568	kvfree(memslot->dirty_bitmap);
				569	memslot->dirty_bitmap = NULL;
				570	}
				571
				572	/*
				573	* Free any memory in @free but not in @dont.
				574	*/
				575	static void kvm_free_memslot(struct kvm kvm, struct kvm_memory_slot free,
				576	struct kvm_memory_slot *dont)
				577	{
				578	if (!dont \|\| free->dirty_bitmap != dont->dirty_bitmap)
				579	kvm_destroy_dirty_bitmap(free);
				580
				581	kvm_arch_free_memslot(kvm, free, dont);
				582
				583	free->npages = 0;
				584	}
				585
				586	static void kvm_free_memslots(struct kvm kvm, struct kvm_memslots slots)
				587	{
				588	struct kvm_memory_slot *memslot;
				589
				590	if (!slots)
				591	return;
				592
				593	kvm_for_each_memslot(memslot, slots)
				594	kvm_free_memslot(kvm, memslot, NULL);
				595
				596	kvfree(slots);
				597	}
				598
				599	static void kvm_destroy_vm_debugfs(struct kvm *kvm)
				600	{
				601	int i;
				602
				603	if (!kvm->debugfs_dentry)
				604	return;
				605
				606	debugfs_remove_recursive(kvm->debugfs_dentry);
				607
				608	if (kvm->debugfs_stat_data) {
				609	for (i = 0; i < kvm_debugfs_num_entries; i++)
				610	kfree(kvm->debugfs_stat_data[i]);
				611	kfree(kvm->debugfs_stat_data);
				612	}
				613	}
				614
				615	static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
				616	{
				617	char dir_name[ITOA_MAX_LEN * 2];
				618	struct kvm_stat_data *stat_data;
				619	struct kvm_stats_debugfs_item *p;
				620
				621	if (!debugfs_initialized())
				622	return 0;
				623
				624	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
				625	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
				626
				627	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
				628	sizeof(*kvm->debugfs_stat_data),
				629	GFP_KERNEL);
				630	if (!kvm->debugfs_stat_data)
				631	return -ENOMEM;
				632
				633	for (p = debugfs_entries; p->name; p++) {
				634	stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
				635	if (!stat_data)
				636	return -ENOMEM;
				637
				638	stat_data->kvm = kvm;
				639	stat_data->offset = p->offset;
				640	stat_data->mode = p->mode ? p->mode : 0644;
				641	kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
				642	debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
				643	stat_data, stat_fops_per_vm[p->kind]);
				644	}
				645	return 0;
				646	}
				647
				648	/*
				649	* Called after the VM is otherwise initialized, but just before adding it to
				650	* the vm_list.
				651	*/
				652	int __weak kvm_arch_post_init_vm(struct kvm *kvm)
				653	{
				654	return 0;
				655	}
				656
				657	/*
				658	* Called just after removing the VM from the vm_list, but before doing any
				659	* other destruction.
				660	*/
				661	void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
				662	{
				663	}
				664
				665	static struct kvm *kvm_create_vm(unsigned long type)
				666	{
				667	int r, i;
				668	struct kvm *kvm = kvm_arch_alloc_vm();
				669
				670	if (!kvm)
				671	return ERR_PTR(-ENOMEM);
				672
				673	spin_lock_init(&kvm->mmu_lock);
				674	mmgrab(current->mm);
				675	kvm->mm = current->mm;
				676	kvm_eventfd_init(kvm);
				677	mutex_init(&kvm->lock);
				678	mutex_init(&kvm->irq_lock);
				679	mutex_init(&kvm->slots_lock);
				680	refcount_set(&kvm->users_count, 1);
				681	INIT_LIST_HEAD(&kvm->devices);
				682
				683	r = kvm_arch_init_vm(kvm, type);
				684	if (r)
				685	goto out_err_no_disable;
				686
				687	r = hardware_enable_all();
				688	if (r)
				689	goto out_err_no_disable;
				690
				691	#ifdef CONFIG_HAVE_KVM_IRQFD
				692	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
				693	#endif
				694
				695	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
				696
				697	r = -ENOMEM;
				698	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				699	struct kvm_memslots *slots = kvm_alloc_memslots();
				700	if (!slots)
				701	goto out_err_no_srcu;
				702	/*
				703	* Generations must be different for each address space.
				704	* Init kvm generation close to the maximum to easily test the
				705	* code of handling generation number wrap-around.
				706	*/
				707	slots->generation = i * 2 - 150;
				708	rcu_assign_pointer(kvm->memslots[i], slots);
				709	}
				710
				711	if (init_srcu_struct(&kvm->srcu))
				712	goto out_err_no_srcu;
				713	if (init_srcu_struct(&kvm->irq_srcu))
				714	goto out_err_no_irq_srcu;
				715	for (i = 0; i < KVM_NR_BUSES; i++) {
				716	rcu_assign_pointer(kvm->buses[i],
				717	kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
				718	if (!kvm->buses[i])
				719	goto out_err_no_mmu_notifier;
				720	}
				721
				722	r = kvm_init_mmu_notifier(kvm);
				723	if (r)
				724	goto out_err_no_mmu_notifier;
				725
				726	r = kvm_arch_post_init_vm(kvm);
				727	if (r)
				728	goto out_err;
				729
				730	mutex_lock(&kvm_lock);
				731	list_add(&kvm->vm_list, &vm_list);
				732	mutex_unlock(&kvm_lock);
				733
				734	preempt_notifier_inc();
				735
				736	return kvm;
				737
				738	out_err:
				739	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				740	if (kvm->mmu_notifier.ops)
				741	mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
				742	#endif
				743	out_err_no_mmu_notifier:
				744	cleanup_srcu_struct(&kvm->irq_srcu);
				745	out_err_no_irq_srcu:
				746	cleanup_srcu_struct(&kvm->srcu);
				747	out_err_no_srcu:
				748	hardware_disable_all();
				749	out_err_no_disable:
				750	refcount_set(&kvm->users_count, 0);
				751	for (i = 0; i < KVM_NR_BUSES; i++)
				752	kfree(kvm_get_bus(kvm, i));
				753	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				754	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				755	kvm_arch_free_vm(kvm);
				756	mmdrop(current->mm);
				757	return ERR_PTR(r);
				758	}
				759
				760	static void kvm_destroy_devices(struct kvm *kvm)
				761	{
				762	struct kvm_device dev, tmp;
				763
				764	/*
				765	* We do not need to take the kvm->lock here, because nobody else
				766	* has a reference to the struct kvm at this point and therefore
				767	* cannot access the devices list anyhow.
				768	*/
				769	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
				770	list_del(&dev->vm_node);
				771	dev->ops->destroy(dev);
				772	}
				773	}
				774
				775	static void kvm_destroy_vm(struct kvm *kvm)
				776	{
				777	int i;
				778	struct mm_struct *mm = kvm->mm;
				779
				780	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
				781	kvm_destroy_vm_debugfs(kvm);
				782	kvm_arch_sync_events(kvm);
				783	mutex_lock(&kvm_lock);
				784	list_del(&kvm->vm_list);
				785	mutex_unlock(&kvm_lock);
				786	kvm_arch_pre_destroy_vm(kvm);
				787
				788	kvm_free_irq_routing(kvm);
				789	for (i = 0; i < KVM_NR_BUSES; i++) {
				790	struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
				791
				792	if (bus)
				793	kvm_io_bus_destroy(bus);
				794	kvm->buses[i] = NULL;
				795	}
				796	kvm_coalesced_mmio_free(kvm);
				797	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				798	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
				799	#else
				800	kvm_arch_flush_shadow_all(kvm);
				801	#endif
				802	kvm_arch_destroy_vm(kvm);
				803	kvm_destroy_devices(kvm);
				804	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				805	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				806	cleanup_srcu_struct(&kvm->irq_srcu);
				807	cleanup_srcu_struct(&kvm->srcu);
				808	kvm_arch_free_vm(kvm);
				809	preempt_notifier_dec();
				810	hardware_disable_all();
				811	mmdrop(mm);
				812	}
				813
				814	void kvm_get_kvm(struct kvm *kvm)
				815	{
				816	refcount_inc(&kvm->users_count);
				817	}
				818	EXPORT_SYMBOL_GPL(kvm_get_kvm);
				819
				820	void kvm_put_kvm(struct kvm *kvm)
				821	{
				822	if (refcount_dec_and_test(&kvm->users_count))
				823	kvm_destroy_vm(kvm);
				824	}
				825	EXPORT_SYMBOL_GPL(kvm_put_kvm);
				826
				827
				828	static int kvm_vm_release(struct inode inode, struct file filp)
				829	{
				830	struct kvm *kvm = filp->private_data;
				831
				832	kvm_irqfd_release(kvm);
				833
				834	kvm_put_kvm(kvm);
				835	return 0;
				836	}
				837
				838	/*
				839	* Allocation size is twice as large as the actual dirty bitmap size.
				840	* See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
				841	*/
				842	static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
				843	{
				844	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
				845
				846	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
				847	if (!memslot->dirty_bitmap)
				848	return -ENOMEM;
				849
				850	return 0;
				851	}
				852
				853	/*
				854	* Insert memslot and re-sort memslots based on their GFN,
				855	* so binary search could be used to lookup GFN.
				856	* Sorting algorithm takes advantage of having initially
				857	* sorted array and known changed memslot position.
				858	*/
				859	static void update_memslots(struct kvm_memslots *slots,
				860	struct kvm_memory_slot *new)
				861	{
				862	int id = new->id;
				863	int i = slots->id_to_index[id];
				864	struct kvm_memory_slot *mslots = slots->memslots;
				865
				866	WARN_ON(mslots[i].id != id);
				867	if (!new->npages) {
				868	WARN_ON(!mslots[i].npages);
				869	if (mslots[i].npages)
				870	slots->used_slots--;
				871	} else {
				872	if (!mslots[i].npages)
				873	slots->used_slots++;
				874	}
				875
				876	while (i < KVM_MEM_SLOTS_NUM - 1 &&
				877	new->base_gfn <= mslots[i + 1].base_gfn) {
				878	if (!mslots[i + 1].npages)
				879	break;
				880	mslots[i] = mslots[i + 1];
				881	slots->id_to_index[mslots[i].id] = i;
				882	i++;
				883	}
				884
				885	/*
				886	* The ">=" is needed when creating a slot with base_gfn == 0,
				887	* so that it moves before all those with base_gfn == npages == 0.
				888	*
				889	* On the other hand, if new->npages is zero, the above loop has
				890	* already left i pointing to the beginning of the empty part of
				891	* mslots, and the ">=" would move the hole backwards in this
				892	* case---which is wrong. So skip the loop when deleting a slot.
				893	*/
				894	if (new->npages) {
				895	while (i > 0 &&
				896	new->base_gfn >= mslots[i - 1].base_gfn) {
				897	mslots[i] = mslots[i - 1];
				898	slots->id_to_index[mslots[i].id] = i;
				899	i--;
				900	}
				901	} else
				902	WARN_ON_ONCE(i != slots->used_slots);
				903
				904	mslots[i] = *new;
				905	slots->id_to_index[mslots[i].id] = i;
				906	}
				907
				908	static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
				909	{
				910	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
				911
				912	#ifdef __KVM_HAVE_READONLY_MEM
				913	valid_flags \|= KVM_MEM_READONLY;
				914	#endif
				915
				916	if (mem->flags & ~valid_flags)
				917	return -EINVAL;
				918
				919	return 0;
				920	}
				921
				922	static struct kvm_memslots install_new_memslots(struct kvm kvm,
				923	int as_id, struct kvm_memslots *slots)
				924	{
				925	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
				926	u64 gen;
				927
				928	/*
				929	* Set the low bit in the generation, which disables SPTE caching
				930	* until the end of synchronize_srcu_expedited.
				931	*/
				932	WARN_ON(old_memslots->generation & 1);
				933	slots->generation = old_memslots->generation + 1;
				934
				935	rcu_assign_pointer(kvm->memslots[as_id], slots);
				936	synchronize_srcu_expedited(&kvm->srcu);
				937
				938	/*
				939	* Increment the new memslot generation a second time. This prevents
				940	* vm exits that race with memslot updates from caching a memslot
				941	* generation that will (potentially) be valid forever.
				942	*
				943	* Generations must be unique even across address spaces. We do not need
				944	* a global counter for that, instead the generation space is evenly split
				945	* across address spaces. For example, with two address spaces, address
				946	* space 0 will use generations 0, 4, 8, ... while * address space 1 will
				947	* use generations 2, 6, 10, 14, ...
				948	*/
				949	gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
				950
				951	kvm_arch_memslots_updated(kvm, gen);
				952
				953	slots->generation = gen;
				954
				955	return old_memslots;
				956	}
				957
				958	/*
				959	* Allocate some memory and give it an address in the guest physical address
				960	* space.
				961	*
				962	* Discontiguous memory is allowed, mostly for framebuffers.
				963	*
				964	* Must be called holding kvm->slots_lock for write.
				965	*/
				966	int __kvm_set_memory_region(struct kvm *kvm,
				967	const struct kvm_userspace_memory_region *mem)
				968	{
				969	int r;
				970	gfn_t base_gfn;
				971	unsigned long npages;
				972	struct kvm_memory_slot *slot;
				973	struct kvm_memory_slot old, new;
				974	struct kvm_memslots slots = NULL, old_memslots;
				975	int as_id, id;
				976	enum kvm_mr_change change;
				977
				978	r = check_memory_region_flags(mem);
				979	if (r)
				980	goto out;
				981
				982	r = -EINVAL;
				983	as_id = mem->slot >> 16;
				984	id = (u16)mem->slot;
				985
				986	/* General sanity checks */
				987	if (mem->memory_size & (PAGE_SIZE - 1))
				988	goto out;
				989	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
				990	goto out;
				991	/* We can read the guest memory with __xxx_user() later on. */
				992	if ((id < KVM_USER_MEM_SLOTS) &&
				993	((mem->userspace_addr & (PAGE_SIZE - 1)) \|\|
				994	!access_ok(VERIFY_WRITE,
				995	(void __user *)(unsigned long)mem->userspace_addr,
				996	mem->memory_size)))
				997	goto out;
				998	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_MEM_SLOTS_NUM)
				999	goto out;
				1000	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
				1001	goto out;
				1002
				1003	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
				1004	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
				1005	npages = mem->memory_size >> PAGE_SHIFT;
				1006
				1007	if (npages > KVM_MEM_MAX_NR_PAGES)
				1008	goto out;
				1009
				1010	new = old = *slot;
				1011
				1012	new.id = id;
				1013	new.base_gfn = base_gfn;
				1014	new.npages = npages;
				1015	new.flags = mem->flags;
				1016
				1017	if (npages) {
				1018	if (!old.npages)
				1019	change = KVM_MR_CREATE;
				1020	else { /* Modify an existing slot. */
				1021	if ((mem->userspace_addr != old.userspace_addr) \|\|
				1022	(npages != old.npages) \|\|
				1023	((new.flags ^ old.flags) & KVM_MEM_READONLY))
				1024	goto out;
				1025
				1026	if (base_gfn != old.base_gfn)
				1027	change = KVM_MR_MOVE;
				1028	else if (new.flags != old.flags)
				1029	change = KVM_MR_FLAGS_ONLY;
				1030	else { /* Nothing to change. */
				1031	r = 0;
				1032	goto out;
				1033	}
				1034	}
				1035	} else {
				1036	if (!old.npages)
				1037	goto out;
				1038
				1039	change = KVM_MR_DELETE;
				1040	new.base_gfn = 0;
				1041	new.flags = 0;
				1042	}
				1043
				1044	if ((change == KVM_MR_CREATE) \|\| (change == KVM_MR_MOVE)) {
				1045	/* Check for overlaps */
				1046	r = -EEXIST;
				1047	kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
				1048	if (slot->id == id)
				1049	continue;
				1050	if (!((base_gfn + npages <= slot->base_gfn) \|\|
				1051	(base_gfn >= slot->base_gfn + slot->npages)))
				1052	goto out;
				1053	}
				1054	}
				1055
				1056	/* Free page dirty bitmap if unneeded */
				1057	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
				1058	new.dirty_bitmap = NULL;
				1059
				1060	r = -ENOMEM;
				1061	if (change == KVM_MR_CREATE) {
				1062	new.userspace_addr = mem->userspace_addr;
				1063
				1064	if (kvm_arch_create_memslot(kvm, &new, npages))
				1065	goto out_free;
				1066	}
				1067
				1068	/* Allocate page dirty bitmap if needed */
				1069	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
				1070	if (kvm_create_dirty_bitmap(&new) < 0)
				1071	goto out_free;
				1072	}
				1073
				1074	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
				1075	if (!slots)
				1076	goto out_free;
				1077	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
				1078
				1079	if ((change == KVM_MR_DELETE) \|\| (change == KVM_MR_MOVE)) {
				1080	slot = id_to_memslot(slots, id);
				1081	slot->flags \|= KVM_MEMSLOT_INVALID;
				1082
				1083	old_memslots = install_new_memslots(kvm, as_id, slots);
				1084
				1085	/* From this point no new shadow pages pointing to a deleted,
				1086	* or moved, memslot will be created.
				1087	*
				1088	* validation of sp->gfn happens in:
				1089	* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
				1090	* - kvm_is_visible_gfn (mmu_check_roots)
				1091	*/
				1092	kvm_arch_flush_shadow_memslot(kvm, slot);
				1093
				1094	/*
				1095	* We can re-use the old_memslots from above, the only difference
				1096	* from the currently installed memslots is the invalid flag. This
				1097	* will get overwritten by update_memslots anyway.
				1098	*/
				1099	slots = old_memslots;
				1100	}
				1101
				1102	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
				1103	if (r)
				1104	goto out_slots;
				1105
				1106	/* actual memory is freed via old in kvm_free_memslot below */
				1107	if (change == KVM_MR_DELETE) {
				1108	new.dirty_bitmap = NULL;
				1109	memset(&new.arch, 0, sizeof(new.arch));
				1110	}
				1111
				1112	update_memslots(slots, &new);
				1113	old_memslots = install_new_memslots(kvm, as_id, slots);
				1114
				1115	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
				1116
				1117	kvm_free_memslot(kvm, &old, &new);
				1118	kvfree(old_memslots);
				1119	return 0;
				1120
				1121	out_slots:
				1122	kvfree(slots);
				1123	out_free:
				1124	kvm_free_memslot(kvm, &new, &old);
				1125	out:
				1126	return r;
				1127	}
				1128	EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
				1129
				1130	int kvm_set_memory_region(struct kvm *kvm,
				1131	const struct kvm_userspace_memory_region *mem)
				1132	{
				1133	int r;
				1134
				1135	mutex_lock(&kvm->slots_lock);
				1136	r = __kvm_set_memory_region(kvm, mem);
				1137	mutex_unlock(&kvm->slots_lock);
				1138	return r;
				1139	}
				1140	EXPORT_SYMBOL_GPL(kvm_set_memory_region);
				1141
				1142	static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				1143	struct kvm_userspace_memory_region *mem)
				1144	{
				1145	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
				1146	return -EINVAL;
				1147
				1148	return kvm_set_memory_region(kvm, mem);
				1149	}
				1150
				1151	int kvm_get_dirty_log(struct kvm *kvm,
				1152	struct kvm_dirty_log log, int is_dirty)
				1153	{
				1154	struct kvm_memslots *slots;
				1155	struct kvm_memory_slot *memslot;
				1156	int i, as_id, id;
				1157	unsigned long n;
				1158	unsigned long any = 0;
				1159
				1160	as_id = log->slot >> 16;
				1161	id = (u16)log->slot;
				1162	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
				1163	return -EINVAL;
				1164
				1165	slots = __kvm_memslots(kvm, as_id);
				1166	memslot = id_to_memslot(slots, id);
				1167	if (!memslot->dirty_bitmap)
				1168	return -ENOENT;
				1169
				1170	n = kvm_dirty_bitmap_bytes(memslot);
				1171
				1172	for (i = 0; !any && i < n/sizeof(long); ++i)
				1173	any = memslot->dirty_bitmap[i];
				1174
				1175	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
				1176	return -EFAULT;
				1177
				1178	if (any)
				1179	*is_dirty = 1;
				1180	return 0;
				1181	}
				1182	EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
				1183
				#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				/**
				 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
				 *	are dirty write protect them for next write.
				 * @kvm:	pointer to kvm instance
				 * @log:	slot id and address to which we copy the log
				 * @is_dirty:	flag set if any page is dirty
				 *
				 * We need to keep it in mind that VCPU threads can write to the bitmap
				 * concurrently. So, to avoid losing track of dirty pages we keep the
				 * following order:
				 *
				 *    1. Take a snapshot of the bit and clear it if needed.
				 *    2. Write protect the corresponding page.
				 *    3. Copy the snapshot to the userspace.
				 *    4. Upon return caller flushes TLB's if needed.
				 *
				 * Between 2 and 4, the guest may write to the page using the remaining TLB
				 * entry.  This is not a problem because the page is reported dirty using
				 * the snapshot taken before and step 4 ensures that writes done after
				 * exiting to userspace will be logged for the next call.
				 *
				 * Return: 0 on success, -EINVAL for an out-of-range address space or slot
				 * id, -ENOENT when the slot has no dirty bitmap, -EFAULT when the copy to
				 * userspace fails.
				 */
				int kvm_get_dirty_log_protect(struct kvm *kvm,
							      struct kvm_dirty_log *log, bool *is_dirty)
				{
					struct kvm_memslots *slots;
					struct kvm_memory_slot *memslot;
					int i, as_id, id;
					unsigned long n;
					unsigned long *dirty_bitmap;
					unsigned long *dirty_bitmap_buffer;

					as_id = log->slot >> 16;
					id = (u16)log->slot;
					if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
						return -EINVAL;

					slots = __kvm_memslots(kvm, as_id);
					memslot = id_to_memslot(slots, id);

					dirty_bitmap = memslot->dirty_bitmap;
					if (!dirty_bitmap)
						return -ENOENT;

					n = kvm_dirty_bitmap_bytes(memslot);

					/* The snapshot is built in the slot's second bitmap buffer. */
					dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
					memset(dirty_bitmap_buffer, 0, n);

					spin_lock(&kvm->mmu_lock);
					*is_dirty = false;
					for (i = 0; i < n / sizeof(long); i++) {
						unsigned long mask;
						gfn_t offset;

						if (!dirty_bitmap[i])
							continue;

						*is_dirty = true;

						/* Atomically fetch and clear the word. */
						mask = xchg(&dirty_bitmap[i], 0);
						dirty_bitmap_buffer[i] = mask;

						if (mask) {
							offset = i * BITS_PER_LONG;
							kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
												offset, mask);
						}
					}

					spin_unlock(&kvm->mmu_lock);
					if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
						return -EFAULT;
					return 0;
				}
				EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
				#endif
				1262
				1263	bool kvm_largepages_enabled(void)
				1264	{
				1265	return largepages_enabled;
				1266	}
				1267
				1268	void kvm_disable_largepages(void)
				1269	{
				1270	largepages_enabled = false;
				1271	}
				1272	EXPORT_SYMBOL_GPL(kvm_disable_largepages);
				1273
				1274	struct kvm_memory_slot gfn_to_memslot(struct kvm kvm, gfn_t gfn)
				1275	{
				1276	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
				1277	}
				1278	EXPORT_SYMBOL_GPL(gfn_to_memslot);
				1279
				1280	struct kvm_memory_slot kvm_vcpu_gfn_to_memslot(struct kvm_vcpu vcpu, gfn_t gfn)
				1281	{
				1282	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
				1283	}
				1284
				1285	bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
				1286	{
				1287	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
				1288
				1289	if (!memslot \|\| memslot->id >= KVM_USER_MEM_SLOTS \|\|
				1290	memslot->flags & KVM_MEMSLOT_INVALID)
				1291	return false;
				1292
				1293	return true;
				1294	}
				1295	EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
				1296
				1297	unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
				1298	{
				1299	struct vm_area_struct *vma;
				1300	unsigned long addr, size;
				1301
				1302	size = PAGE_SIZE;
				1303
				1304	addr = gfn_to_hva(kvm, gfn);
				1305	if (kvm_is_error_hva(addr))
				1306	return PAGE_SIZE;
				1307
				1308	down_read(&current->mm->mmap_sem);
				1309	vma = find_vma(current->mm, addr);
				1310	if (!vma)
				1311	goto out;
				1312
				1313	size = vma_kernel_pagesize(vma);
				1314
				1315	out:
				1316	up_read(&current->mm->mmap_sem);
				1317
				1318	return size;
				1319	}
				1320
				1321	static bool memslot_is_readonly(struct kvm_memory_slot *slot)
				1322	{
				1323	return slot->flags & KVM_MEM_READONLY;
				1324	}
				1325
				1326	static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1327	gfn_t *nr_pages, bool write)
				1328	{
				1329	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1330	return KVM_HVA_ERR_BAD;
				1331
				1332	if (memslot_is_readonly(slot) && write)
				1333	return KVM_HVA_ERR_RO_BAD;
				1334
				1335	if (nr_pages)
				1336	*nr_pages = slot->npages - (gfn - slot->base_gfn);
				1337
				1338	return __gfn_to_hva_memslot(slot, gfn);
				1339	}
				1340
				1341	static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1342	gfn_t *nr_pages)
				1343	{
				1344	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
				1345	}
				1346
				1347	unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				1348	gfn_t gfn)
				1349	{
				1350	return gfn_to_hva_many(slot, gfn, NULL);
				1351	}
				1352	EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
				1353
				1354	unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
				1355	{
				1356	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
				1357	}
				1358	EXPORT_SYMBOL_GPL(gfn_to_hva);
				1359
				1360	unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
				1361	{
				1362	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
				1363	}
				1364	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
				1365
				1366	/*
				1367	* If writable is set to false, the hva returned by this function is only
				1368	* allowed to be read.
				1369	*/
				1370	unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				1371	gfn_t gfn, bool *writable)
				1372	{
				1373	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
				1374
				1375	if (!kvm_is_error_hva(hva) && writable)
				1376	*writable = !memslot_is_readonly(slot);
				1377
				1378	return hva;
				1379	}
				1380
				1381	unsigned long gfn_to_hva_prot(struct kvm kvm, gfn_t gfn, bool writable)
				1382	{
				1383	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1384
				1385	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1386	}
				1387
				1388	unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu vcpu, gfn_t gfn, bool writable)
				1389	{
				1390	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1391
				1392	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1393	}
				1394
				1395	static inline int check_user_page_hwpoison(unsigned long addr)
				1396	{
				1397	int rc, flags = FOLL_HWPOISON \| FOLL_WRITE;
				1398
				1399	rc = get_user_pages(addr, 1, flags, NULL, NULL);
				1400	return rc == -EHWPOISON;
				1401	}
				1402
				1403	/*
				1404	* The fast path to get the writable pfn which will be stored in @pfn,
				1405	* true indicates success, otherwise false is returned. It's also the
				1406	* only part that runs if we can are in atomic context.
				1407	*/
				1408	static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
				1409	bool writable, kvm_pfn_t pfn)
				1410	{
				1411	struct page *page[1];
				1412	int npages;
				1413
				1414	/*
				1415	* Fast pin a writable pfn only if it is a write fault request
				1416	* or the caller allows to map a writable pfn for a read fault
				1417	* request.
				1418	*/
				1419	if (!(write_fault \|\| writable))
				1420	return false;
				1421
				1422	npages = __get_user_pages_fast(addr, 1, 1, page);
				1423	if (npages == 1) {
				1424	*pfn = page_to_pfn(page[0]);
				1425
				1426	if (writable)
				1427	*writable = true;
				1428	return true;
				1429	}
				1430
				1431	return false;
				1432	}
				1433
				1434	/*
				1435	* The slow path to get the pfn of the specified host virtual address,
				1436	* 1 indicates success, -errno is returned if error is detected.
				1437	*/
				1438	static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
				1439	bool writable, kvm_pfn_t pfn)
				1440	{
				1441	unsigned int flags = FOLL_HWPOISON;
				1442	struct page *page;
				1443	int npages = 0;
				1444
				1445	might_sleep();
				1446
				1447	if (writable)
				1448	*writable = write_fault;
				1449
				1450	if (write_fault)
				1451	flags \|= FOLL_WRITE;
				1452	if (async)
				1453	flags \|= FOLL_NOWAIT;
				1454
				1455	npages = get_user_pages_unlocked(addr, 1, &page, flags);
				1456	if (npages != 1)
				1457	return npages;
				1458
				1459	/* map read fault as writable if possible */
				1460	if (unlikely(!write_fault) && writable) {
				1461	struct page *wpage;
				1462
				1463	if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
				1464	*writable = true;
				1465	put_page(page);
				1466	page = wpage;
				1467	}
				1468	}
				1469	*pfn = page_to_pfn(page);
				1470	return npages;
				1471	}
				1472
				1473	static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
				1474	{
				1475	if (unlikely(!(vma->vm_flags & VM_READ)))
				1476	return false;
				1477
				1478	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
				1479	return false;
				1480
				1481	return true;
				1482	}
				1483
				1484	static int hva_to_pfn_remapped(struct vm_area_struct *vma,
				1485	unsigned long addr, bool *async,
				1486	bool write_fault, bool *writable,
				1487	kvm_pfn_t *p_pfn)
				1488	{
				1489	unsigned long pfn;
				1490	int r;
				1491
				1492	r = follow_pfn(vma, addr, &pfn);
				1493	if (r) {
				1494	/*
				1495	* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
				1496	* not call the fault handler, so do it here.
				1497	*/
				1498	bool unlocked = false;
				1499	r = fixup_user_fault(current, current->mm, addr,
				1500	(write_fault ? FAULT_FLAG_WRITE : 0),
				1501	&unlocked);
				1502	if (unlocked)
				1503	return -EAGAIN;
				1504	if (r)
				1505	return r;
				1506
				1507	r = follow_pfn(vma, addr, &pfn);
				1508	if (r)
				1509	return r;
				1510
				1511	}
				1512
				1513	if (writable)
				1514	*writable = true;
				1515
				1516	/*
				1517	* Get a reference here because callers of hva_to_pfn and
				1518	* gfn_to_pfn ultimately call kvm_release_pfn_clean on the
				1519	* returned pfn. This is only needed if the VMA has VM_MIXEDMAP
				1520	* set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
				1521	* simply do nothing for reserved pfns.
				1522	*
				1523	* Whoever called remap_pfn_range is also going to call e.g.
				1524	* unmap_mapping_range before the underlying pages are freed,
				1525	* causing a call to our MMU notifier.
				1526	*/
				1527	kvm_get_pfn(pfn);
				1528
				1529	*p_pfn = pfn;
				1530	return 0;
				1531	}
				1532
				1533	/*
				1534	* Pin guest page in memory and return its pfn.
				1535	* @addr: host virtual address which maps memory to the guest
				1536	* @atomic: whether this function can sleep
				1537	* @async: whether this function need to wait IO complete if the
				1538	* host page is not in the memory
				1539	* @write_fault: whether we should get a writable host page
				1540	* @writable: whether it allows to map a writable host page for !@write_fault
				1541	*
				1542	* The function will map a writable host page for these two cases:
				1543	* 1): @write_fault = true
				1544	* 2): @write_fault = false && @writable, @writable will tell the caller
				1545	* whether the mapping is writable.
				1546	*/
				1547	static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
				1548	bool write_fault, bool *writable)
				1549	{
				1550	struct vm_area_struct *vma;
				1551	kvm_pfn_t pfn = 0;
				1552	int npages, r;
				1553
				1554	/* we can do it either atomically or asynchronously, not both */
				1555	BUG_ON(atomic && async);
				1556
				1557	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
				1558	return pfn;
				1559
				1560	if (atomic)
				1561	return KVM_PFN_ERR_FAULT;
				1562
				1563	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
				1564	if (npages == 1)
				1565	return pfn;
				1566
				1567	down_read(&current->mm->mmap_sem);
				1568	if (npages == -EHWPOISON \|\|
				1569	(!async && check_user_page_hwpoison(addr))) {
				1570	pfn = KVM_PFN_ERR_HWPOISON;
				1571	goto exit;
				1572	}
				1573
				1574	retry:
				1575	vma = find_vma_intersection(current->mm, addr, addr + 1);
				1576
				1577	if (vma == NULL)
				1578	pfn = KVM_PFN_ERR_FAULT;
				1579	else if (vma->vm_flags & (VM_IO \| VM_PFNMAP)) {
				1580	r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
				1581	if (r == -EAGAIN)
				1582	goto retry;
				1583	if (r < 0)
				1584	pfn = KVM_PFN_ERR_FAULT;
				1585	} else {
				1586	if (async && vma_is_valid(vma, write_fault))
				1587	*async = true;
				1588	pfn = KVM_PFN_ERR_FAULT;
				1589	}
				1590	exit:
				1591	up_read(&current->mm->mmap_sem);
				1592	return pfn;
				1593	}
				1594
				1595	kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
				1596	bool atomic, bool *async, bool write_fault,
				1597	bool *writable)
				1598	{
				1599	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
				1600
				1601	if (addr == KVM_HVA_ERR_RO_BAD) {
				1602	if (writable)
				1603	*writable = false;
				1604	return KVM_PFN_ERR_RO_FAULT;
				1605	}
				1606
				1607	if (kvm_is_error_hva(addr)) {
				1608	if (writable)
				1609	*writable = false;
				1610	return KVM_PFN_NOSLOT;
				1611	}
				1612
				1613	/* Do not map writable pfn in the readonly memslot. */
				1614	if (writable && memslot_is_readonly(slot)) {
				1615	*writable = false;
				1616	writable = NULL;
				1617	}
				1618
				1619	return hva_to_pfn(addr, atomic, async, write_fault,
				1620	writable);
				1621	}
				1622	EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
				1623
				1624	kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
				1625	bool *writable)
				1626	{
				1627	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
				1628	write_fault, writable);
				1629	}
				1630	EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
				1631
				1632	kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
				1633	{
				1634	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
				1635	}
				1636	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
				1637
				1638	kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
				1639	{
				1640	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
				1641	}
				1642	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
				1643
				1644	kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
				1645	{
				1646	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
				1647	}
				1648	EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
				1649
				1650	kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
				1651	{
				1652	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1653	}
				1654	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
				1655
				1656	kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
				1657	{
				1658	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
				1659	}
				1660	EXPORT_SYMBOL_GPL(gfn_to_pfn);
				1661
				1662	kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				1663	{
				1664	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1665	}
				1666	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
				1667
				1668	int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				1669	struct page **pages, int nr_pages)
				1670	{
				1671	unsigned long addr;
				1672	gfn_t entry = 0;
				1673
				1674	addr = gfn_to_hva_many(slot, gfn, &entry);
				1675	if (kvm_is_error_hva(addr))
				1676	return -1;
				1677
				1678	if (entry < nr_pages)
				1679	return 0;
				1680
				1681	return __get_user_pages_fast(addr, nr_pages, 1, pages);
				1682	}
				1683	EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
				1684
				1685	static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
				1686	{
				1687	if (is_error_noslot_pfn(pfn))
				1688	return KVM_ERR_PTR_BAD_PAGE;
				1689
				1690	if (kvm_is_reserved_pfn(pfn)) {
				1691	WARN_ON(1);
				1692	return KVM_ERR_PTR_BAD_PAGE;
				1693	}
				1694
				1695	return pfn_to_page(pfn);
				1696	}
				1697
				1698	struct page gfn_to_page(struct kvm kvm, gfn_t gfn)
				1699	{
				1700	kvm_pfn_t pfn;
				1701
				1702	pfn = gfn_to_pfn(kvm, gfn);
				1703
				1704	return kvm_pfn_to_page(pfn);
				1705	}
				1706	EXPORT_SYMBOL_GPL(gfn_to_page);
				1707
				1708	struct page kvm_vcpu_gfn_to_page(struct kvm_vcpu vcpu, gfn_t gfn)
				1709	{
				1710	kvm_pfn_t pfn;
				1711
				1712	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
				1713
				1714	return kvm_pfn_to_page(pfn);
				1715	}
				1716	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
				1717
				1718	void kvm_release_page_clean(struct page *page)
				1719	{
				1720	WARN_ON(is_error_page(page));
				1721
				1722	kvm_release_pfn_clean(page_to_pfn(page));
				1723	}
				1724	EXPORT_SYMBOL_GPL(kvm_release_page_clean);
				1725
				1726	void kvm_release_pfn_clean(kvm_pfn_t pfn)
				1727	{
				1728	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
				1729	put_page(pfn_to_page(pfn));
				1730	}
				1731	EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
				1732
				1733	void kvm_release_page_dirty(struct page *page)
				1734	{
				1735	WARN_ON(is_error_page(page));
				1736
				1737	kvm_release_pfn_dirty(page_to_pfn(page));
				1738	}
				1739	EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
				1740
				1741	void kvm_release_pfn_dirty(kvm_pfn_t pfn)
				1742	{
				1743	kvm_set_pfn_dirty(pfn);
				1744	kvm_release_pfn_clean(pfn);
				1745	}
				1746	EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
				1747
				1748	void kvm_set_pfn_dirty(kvm_pfn_t pfn)
				1749	{
				1750	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
				1751	struct page *page = pfn_to_page(pfn);
				1752
				1753	if (!PageReserved(page))
				1754	SetPageDirty(page);
				1755	}
				1756	}
				1757	EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
				1758
				1759	void kvm_set_pfn_accessed(kvm_pfn_t pfn)
				1760	{
				1761	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
				1762	mark_page_accessed(pfn_to_page(pfn));
				1763	}
				1764	EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
				1765
				1766	void kvm_get_pfn(kvm_pfn_t pfn)
				1767	{
				1768	if (!kvm_is_reserved_pfn(pfn))
				1769	get_page(pfn_to_page(pfn));
				1770	}
				1771	EXPORT_SYMBOL_GPL(kvm_get_pfn);
				1772
				1773	static int next_segment(unsigned long len, int offset)
				1774	{
				1775	if (len > PAGE_SIZE - offset)
				1776	return PAGE_SIZE - offset;
				1777	else
				1778	return len;
				1779	}
				1780
				1781	static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
				1782	void *data, int offset, int len)
				1783	{
				1784	int r;
				1785	unsigned long addr;
				1786
				1787	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				1788	if (kvm_is_error_hva(addr))
				1789	return -EFAULT;
				1790	r = __copy_from_user(data, (void __user *)addr + offset, len);
				1791	if (r)
				1792	return -EFAULT;
				1793	return 0;
				1794	}
				1795
				1796	int kvm_read_guest_page(struct kvm kvm, gfn_t gfn, void data, int offset,
				1797	int len)
				1798	{
				1799	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1800
				1801	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				1802	}
				1803	EXPORT_SYMBOL_GPL(kvm_read_guest_page);
				1804
				1805	int kvm_vcpu_read_guest_page(struct kvm_vcpu vcpu, gfn_t gfn, void data,
				1806	int offset, int len)
				1807	{
				1808	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1809
				1810	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				1811	}
				1812	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
				1813
				1814	int kvm_read_guest(struct kvm kvm, gpa_t gpa, void data, unsigned long len)
				1815	{
				1816	gfn_t gfn = gpa >> PAGE_SHIFT;
				1817	int seg;
				1818	int offset = offset_in_page(gpa);
				1819	int ret;
				1820
				1821	while ((seg = next_segment(len, offset)) != 0) {
				1822	ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
				1823	if (ret < 0)
				1824	return ret;
				1825	offset = 0;
				1826	len -= seg;
				1827	data += seg;
				1828	++gfn;
				1829	}
				1830	return 0;
				1831	}
				1832	EXPORT_SYMBOL_GPL(kvm_read_guest);
				1833
				1834	int kvm_vcpu_read_guest(struct kvm_vcpu vcpu, gpa_t gpa, void data, unsigned long len)
				1835	{
				1836	gfn_t gfn = gpa >> PAGE_SHIFT;
				1837	int seg;
				1838	int offset = offset_in_page(gpa);
				1839	int ret;
				1840
				1841	while ((seg = next_segment(len, offset)) != 0) {
				1842	ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
				1843	if (ret < 0)
				1844	return ret;
				1845	offset = 0;
				1846	len -= seg;
				1847	data += seg;
				1848	++gfn;
				1849	}
				1850	return 0;
				1851	}
				1852	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
				1853
				1854	static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				1855	void *data, int offset, unsigned long len)
				1856	{
				1857	int r;
				1858	unsigned long addr;
				1859
				1860	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				1861	if (kvm_is_error_hva(addr))
				1862	return -EFAULT;
				1863	pagefault_disable();
				1864	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
				1865	pagefault_enable();
				1866	if (r)
				1867	return -EFAULT;
				1868	return 0;
				1869	}
				1870
				1871	int kvm_read_guest_atomic(struct kvm kvm, gpa_t gpa, void data,
				1872	unsigned long len)
				1873	{
				1874	gfn_t gfn = gpa >> PAGE_SHIFT;
				1875	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1876	int offset = offset_in_page(gpa);
				1877
				1878	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				1879	}
				1880	EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
				1881
				1882	int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
				1883	void *data, unsigned long len)
				1884	{
				1885	gfn_t gfn = gpa >> PAGE_SHIFT;
				1886	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1887	int offset = offset_in_page(gpa);
				1888
				1889	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				1890	}
				1891	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
				1892
				1893	static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
				1894	const void *data, int offset, int len)
				1895	{
				1896	int r;
				1897	unsigned long addr;
				1898
				1899	addr = gfn_to_hva_memslot(memslot, gfn);
				1900	if (kvm_is_error_hva(addr))
				1901	return -EFAULT;
				1902	r = __copy_to_user((void __user *)addr + offset, data, len);
				1903	if (r)
				1904	return -EFAULT;
				1905	mark_page_dirty_in_slot(memslot, gfn);
				1906	return 0;
				1907	}
				1908
				1909	int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
				1910	const void *data, int offset, int len)
				1911	{
				1912	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1913
				1914	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				1915	}
				1916	EXPORT_SYMBOL_GPL(kvm_write_guest_page);
				1917
				1918	int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				1919	const void *data, int offset, int len)
				1920	{
				1921	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1922
				1923	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				1924	}
				1925	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
				1926
				1927	int kvm_write_guest(struct kvm kvm, gpa_t gpa, const void data,
				1928	unsigned long len)
				1929	{
				1930	gfn_t gfn = gpa >> PAGE_SHIFT;
				1931	int seg;
				1932	int offset = offset_in_page(gpa);
				1933	int ret;
				1934
				1935	while ((seg = next_segment(len, offset)) != 0) {
				1936	ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
				1937	if (ret < 0)
				1938	return ret;
				1939	offset = 0;
				1940	len -= seg;
				1941	data += seg;
				1942	++gfn;
				1943	}
				1944	return 0;
				1945	}
				1946	EXPORT_SYMBOL_GPL(kvm_write_guest);
				1947
				1948	int kvm_vcpu_write_guest(struct kvm_vcpu vcpu, gpa_t gpa, const void data,
				1949	unsigned long len)
				1950	{
				1951	gfn_t gfn = gpa >> PAGE_SHIFT;
				1952	int seg;
				1953	int offset = offset_in_page(gpa);
				1954	int ret;
				1955
				1956	while ((seg = next_segment(len, offset)) != 0) {
				1957	ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
				1958	if (ret < 0)
				1959	return ret;
				1960	offset = 0;
				1961	len -= seg;
				1962	data += seg;
				1963	++gfn;
				1964	}
				1965	return 0;
				1966	}
				1967	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
				1968
				1969	static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				1970	struct gfn_to_hva_cache *ghc,
				1971	gpa_t gpa, unsigned long len)
				1972	{
				1973	int offset = offset_in_page(gpa);
				1974	gfn_t start_gfn = gpa >> PAGE_SHIFT;
				1975	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
				1976	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
				1977	gfn_t nr_pages_avail;
				1978
				1979	ghc->gpa = gpa;
				1980	ghc->generation = slots->generation;
				1981	ghc->len = len;
				1982	ghc->memslot = __gfn_to_memslot(slots, start_gfn);
				1983	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
				1984	if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
				1985	ghc->hva += offset;
				1986	} else {
				1987	/*
				1988	* If the requested region crosses two memslots, we still
				1989	* verify that the entire region is valid here.
				1990	*/
				1991	while (start_gfn <= end_gfn) {
				1992	nr_pages_avail = 0;
				1993	ghc->memslot = __gfn_to_memslot(slots, start_gfn);
				1994	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
				1995	&nr_pages_avail);
				1996	if (kvm_is_error_hva(ghc->hva))
				1997	return -EFAULT;
				1998	start_gfn += nr_pages_avail;
				1999	}
				2000	/* Use the slow path for cross page reads and writes. */
				2001	ghc->memslot = NULL;
				2002	}
				2003	return 0;
				2004	}
				2005
				2006	int kvm_gfn_to_hva_cache_init(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2007	gpa_t gpa, unsigned long len)
				2008	{
				2009	struct kvm_memslots *slots = kvm_memslots(kvm);
				2010	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
				2011	}
				2012	EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
				2013
				2014	int kvm_write_guest_offset_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2015	void *data, unsigned int offset,
				2016	unsigned long len)
				2017	{
				2018	struct kvm_memslots *slots = kvm_memslots(kvm);
				2019	int r;
				2020	gpa_t gpa = ghc->gpa + offset;
				2021
				2022	BUG_ON(len + offset > ghc->len);
				2023
				2024	if (slots->generation != ghc->generation)
				2025	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2026
				2027	if (unlikely(!ghc->memslot))
				2028	return kvm_write_guest(kvm, gpa, data, len);
				2029
				2030	if (kvm_is_error_hva(ghc->hva))
				2031	return -EFAULT;
				2032
				2033	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
				2034	if (r)
				2035	return -EFAULT;
				2036	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
				2037
				2038	return 0;
				2039	}
				2040	EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
				2041
				2042	int kvm_write_guest_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2043	void *data, unsigned long len)
				2044	{
				2045	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
				2046	}
				2047	EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
				2048
				2049	int kvm_read_guest_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2050	void *data, unsigned long len)
				2051	{
				2052	struct kvm_memslots *slots = kvm_memslots(kvm);
				2053	int r;
				2054
				2055	BUG_ON(len > ghc->len);
				2056
				2057	if (slots->generation != ghc->generation)
				2058	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2059
				2060	if (unlikely(!ghc->memslot))
				2061	return kvm_read_guest(kvm, ghc->gpa, data, len);
				2062
				2063	if (kvm_is_error_hva(ghc->hva))
				2064	return -EFAULT;
				2065
				2066	r = __copy_from_user(data, (void __user *)ghc->hva, len);
				2067	if (r)
				2068	return -EFAULT;
				2069
				2070	return 0;
				2071	}
				2072	EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
				2073
				2074	int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
				2075	{
				2076	const void zero_page = (const void ) __va(page_to_phys(ZERO_PAGE(0)));
				2077
				2078	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
				2079	}
				2080	EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
				2081
				2082	int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
				2083	{
				2084	gfn_t gfn = gpa >> PAGE_SHIFT;
				2085	int seg;
				2086	int offset = offset_in_page(gpa);
				2087	int ret;
				2088
				2089	while ((seg = next_segment(len, offset)) != 0) {
				2090	ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
				2091	if (ret < 0)
				2092	return ret;
				2093	offset = 0;
				2094	len -= seg;
				2095	++gfn;
				2096	}
				2097	return 0;
				2098	}
				2099	EXPORT_SYMBOL_GPL(kvm_clear_guest);
				2100
				2101	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
				2102	gfn_t gfn)
				2103	{
				2104	if (memslot && memslot->dirty_bitmap) {
				2105	unsigned long rel_gfn = gfn - memslot->base_gfn;
				2106
				2107	set_bit_le(rel_gfn, memslot->dirty_bitmap);
				2108	}
				2109	}
				2110
				2111	void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
				2112	{
				2113	struct kvm_memory_slot *memslot;
				2114
				2115	memslot = gfn_to_memslot(kvm, gfn);
				2116	mark_page_dirty_in_slot(memslot, gfn);
				2117	}
				2118	EXPORT_SYMBOL_GPL(mark_page_dirty);
				2119
				2120	void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
				2121	{
				2122	struct kvm_memory_slot *memslot;
				2123
				2124	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2125	mark_page_dirty_in_slot(memslot, gfn);
				2126	}
				2127	EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
				2128
				2129	void kvm_sigset_activate(struct kvm_vcpu *vcpu)
				2130	{
				2131	if (!vcpu->sigset_active)
				2132	return;
				2133
				2134	/*
				2135	* This does a lockless modification of ->real_blocked, which is fine
				2136	* because, only current can change ->real_blocked and all readers of
				2137	* ->real_blocked don't care as long ->real_blocked is always a subset
				2138	* of ->blocked.
				2139	*/
				2140	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
				2141	}
				2142
				2143	void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
				2144	{
				2145	if (!vcpu->sigset_active)
				2146	return;
				2147
				2148	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
				2149	sigemptyset(&current->real_blocked);
				2150	}
				2151
				2152	static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
				2153	{
				2154	unsigned int old, val, grow;
				2155
				2156	old = val = vcpu->halt_poll_ns;
				2157	grow = READ_ONCE(halt_poll_ns_grow);
				2158	/* 10us base */
				2159	if (val == 0 && grow)
				2160	val = 10000;
				2161	else
				2162	val *= grow;
				2163
				2164	if (val > halt_poll_ns)
				2165	val = halt_poll_ns;
				2166
				2167	vcpu->halt_poll_ns = val;
				2168	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
				2169	}
				2170
				2171	static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
				2172	{
				2173	unsigned int old, val, shrink;
				2174
				2175	old = val = vcpu->halt_poll_ns;
				2176	shrink = READ_ONCE(halt_poll_ns_shrink);
				2177	if (shrink == 0)
				2178	val = 0;
				2179	else
				2180	val /= shrink;
				2181
				2182	vcpu->halt_poll_ns = val;
				2183	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
				2184	}
				2185
				2186	static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
				2187	{
				2188	int ret = -EINTR;
				2189	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				2190
				2191	if (kvm_arch_vcpu_runnable(vcpu)) {
				2192	kvm_make_request(KVM_REQ_UNHALT, vcpu);
				2193	goto out;
				2194	}
				2195	if (kvm_cpu_has_pending_timer(vcpu))
				2196	goto out;
				2197	if (signal_pending(current))
				2198	goto out;
				2199
				2200	ret = 0;
				2201	out:
				2202	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				2203	return ret;
				2204	}
				2205
				2206	/*
				2207	* The vCPU has executed a HLT instruction with in-kernel mode enabled.
				2208	*/
				2209	void kvm_vcpu_block(struct kvm_vcpu *vcpu)
				2210	{
				2211	ktime_t start, cur;
				2212	DECLARE_SWAITQUEUE(wait);
				2213	bool waited = false;
				2214	u64 block_ns;
				2215
				2216	start = cur = ktime_get();
				2217	if (vcpu->halt_poll_ns) {
				2218	ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
				2219
				2220	++vcpu->stat.halt_attempted_poll;
				2221	do {
				2222	/*
				2223	* This sets KVM_REQ_UNHALT if an interrupt
				2224	* arrives.
				2225	*/
				2226	if (kvm_vcpu_check_block(vcpu) < 0) {
				2227	++vcpu->stat.halt_successful_poll;
				2228	if (!vcpu_valid_wakeup(vcpu))
				2229	++vcpu->stat.halt_poll_invalid;
				2230	goto out;
				2231	}
				2232	cur = ktime_get();
				2233	} while (single_task_running() && ktime_before(cur, stop));
				2234	}
				2235
				2236	kvm_arch_vcpu_blocking(vcpu);
				2237
				2238	for (;;) {
				2239	prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
				2240
				2241	if (kvm_vcpu_check_block(vcpu) < 0)
				2242	break;
				2243
				2244	waited = true;
				2245	schedule();
				2246	}
				2247
				2248	finish_swait(&vcpu->wq, &wait);
				2249	cur = ktime_get();
				2250
				2251	kvm_arch_vcpu_unblocking(vcpu);
				2252	out:
				2253	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
				2254
				2255	if (!vcpu_valid_wakeup(vcpu))
				2256	shrink_halt_poll_ns(vcpu);
				2257	else if (halt_poll_ns) {
				2258	if (block_ns <= vcpu->halt_poll_ns)
				2259	;
				2260	/* we had a long block, shrink polling */
				2261	else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
				2262	shrink_halt_poll_ns(vcpu);
				2263	/* we had a short halt and our poll time is too small */
				2264	else if (vcpu->halt_poll_ns < halt_poll_ns &&
				2265	block_ns < halt_poll_ns)
				2266	grow_halt_poll_ns(vcpu);
				2267	} else
				2268	vcpu->halt_poll_ns = 0;
				2269
				2270	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
				2271	kvm_arch_vcpu_block_finish(vcpu);
				2272	}
				2273	EXPORT_SYMBOL_GPL(kvm_vcpu_block);
				2274
				2275	bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
				2276	{
				2277	struct swait_queue_head *wqp;
				2278
				2279	wqp = kvm_arch_vcpu_wq(vcpu);
				2280	if (swq_has_sleeper(wqp)) {
				2281	swake_up_one(wqp);
				2282	++vcpu->stat.halt_wakeup;
				2283	return true;
				2284	}
				2285
				2286	return false;
				2287	}
				2288	EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
				2289
				2290	#ifndef CONFIG_S390
				2291	/*
				2292	* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
				2293	*/
				2294	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
				2295	{
				2296	int me;
				2297	int cpu = vcpu->cpu;
				2298
				2299	if (kvm_vcpu_wake_up(vcpu))
				2300	return;
				2301
				2302	me = get_cpu();
				2303	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
				2304	if (kvm_arch_vcpu_should_kick(vcpu))
				2305	smp_send_reschedule(cpu);
				2306	put_cpu();
				2307	}
				2308	EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
				2309	#endif /* !CONFIG_S390 */
				2310
				2311	int kvm_vcpu_yield_to(struct kvm_vcpu *target)
				2312	{
				2313	struct pid *pid;
				2314	struct task_struct *task = NULL;
				2315	int ret = 0;
				2316
				2317	rcu_read_lock();
				2318	pid = rcu_dereference(target->pid);
				2319	if (pid)
				2320	task = get_pid_task(pid, PIDTYPE_PID);
				2321	rcu_read_unlock();
				2322	if (!task)
				2323	return ret;
				2324	ret = yield_to(task, 1);
				2325	put_task_struct(task);
				2326
				2327	return ret;
				2328	}
				2329	EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
				2330
				2331	/*
				2332	* Helper that checks whether a VCPU is eligible for directed yield.
				2333	* Most eligible candidate to yield is decided by following heuristics:
				2334	*
				2335	* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
				2336	* (preempted lock holder), indicated by @in_spin_loop.
				2337	* Set at the beiginning and cleared at the end of interception/PLE handler.
				2338	*
				2339	* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
				2340	* chance last time (mostly it has become eligible now since we have probably
				2341	* yielded to lockholder in last iteration. This is done by toggling
				2342	* @dy_eligible each time a VCPU checked for eligibility.)
				2343	*
				2344	* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
				2345	* to preempted lock-holder could result in wrong VCPU selection and CPU
				2346	* burning. Giving priority for a potential lock-holder increases lock
				2347	* progress.
				2348	*
				2349	* Since algorithm is based on heuristics, accessing another VCPU data without
				2350	* locking does not harm. It may result in trying to yield to same VCPU, fail
				2351	* and continue with next VCPU and so on.
				2352	*/
				2353	static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
				2354	{
				2355	#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
				2356	bool eligible;
				2357
				2358	eligible = !vcpu->spin_loop.in_spin_loop \|\|
				2359	vcpu->spin_loop.dy_eligible;
				2360
				2361	if (vcpu->spin_loop.in_spin_loop)
				2362	kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
				2363
				2364	return eligible;
				2365	#else
				2366	return true;
				2367	#endif
				2368	}
				2369
				2370	/*
				2371	* Unlike kvm_arch_vcpu_runnable, this function is called outside
				2372	* a vcpu_load/vcpu_put pair. However, for most architectures
				2373	* kvm_arch_vcpu_runnable does not require vcpu_load.
				2374	*/
				2375	bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
				2376	{
				2377	return kvm_arch_vcpu_runnable(vcpu);
				2378	}
				2379
				2380	static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
				2381	{
				2382	if (kvm_arch_dy_runnable(vcpu))
				2383	return true;
				2384
				2385	#ifdef CONFIG_KVM_ASYNC_PF
				2386	if (!list_empty_careful(&vcpu->async_pf.done))
				2387	return true;
				2388	#endif
				2389
				2390	return false;
				2391	}
				2392
				2393	void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
				2394	{
				2395	struct kvm *kvm = me->kvm;
				2396	struct kvm_vcpu *vcpu;
				2397	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
				2398	int yielded = 0;
				2399	int try = 3;
				2400	int pass;
				2401	int i;
				2402
				2403	kvm_vcpu_set_in_spin_loop(me, true);
				2404	/*
				2405	* We boost the priority of a VCPU that is runnable but not
				2406	* currently running, because it got preempted by something
				2407	* else and called schedule in __vcpu_run. Hopefully that
				2408	* VCPU is holding the lock that we need and will release it.
				2409	* We approximate round-robin by starting at the last boosted VCPU.
				2410	*/
				2411	for (pass = 0; pass < 2 && !yielded && try; pass++) {
				2412	kvm_for_each_vcpu(i, vcpu, kvm) {
				2413	if (!pass && i <= last_boosted_vcpu) {
				2414	i = last_boosted_vcpu;
				2415	continue;
				2416	} else if (pass && i > last_boosted_vcpu)
				2417	break;
				2418	if (!READ_ONCE(vcpu->preempted))
				2419	continue;
				2420	if (vcpu == me)
				2421	continue;
				2422	if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
				2423	continue;
				2424	if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
				2425	continue;
				2426	if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
				2427	continue;
				2428
				2429	yielded = kvm_vcpu_yield_to(vcpu);
				2430	if (yielded > 0) {
				2431	kvm->last_boosted_vcpu = i;
				2432	break;
				2433	} else if (yielded < 0) {
				2434	try--;
				2435	if (!try)
				2436	break;
				2437	}
				2438	}
				2439	}
				2440	kvm_vcpu_set_in_spin_loop(me, false);
				2441
				2442	/* Ensure vcpu is not eligible during next spinloop */
				2443	kvm_vcpu_set_dy_eligible(me, false);
				2444	}
				2445	EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
				2446
				2447	static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
				2448	{
				2449	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
				2450	struct page *page;
				2451
				2452	if (vmf->pgoff == 0)
				2453	page = virt_to_page(vcpu->run);
				2454	#ifdef CONFIG_X86
				2455	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
				2456	page = virt_to_page(vcpu->arch.pio_data);
				2457	#endif
				2458	#ifdef CONFIG_KVM_MMIO
				2459	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
				2460	page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
				2461	#endif
				2462	else
				2463	return kvm_arch_vcpu_fault(vcpu, vmf);
				2464	get_page(page);
				2465	vmf->page = page;
				2466	return 0;
				2467	}
				2468
				2469	static const struct vm_operations_struct kvm_vcpu_vm_ops = {
				2470	.fault = kvm_vcpu_fault,
				2471	};
				2472
				2473	static int kvm_vcpu_mmap(struct file file, struct vm_area_struct vma)
				2474	{
				2475	vma->vm_ops = &kvm_vcpu_vm_ops;
				2476	return 0;
				2477	}
				2478
				2479	static int kvm_vcpu_release(struct inode inode, struct file filp)
				2480	{
				2481	struct kvm_vcpu *vcpu = filp->private_data;
				2482
				2483	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2484	kvm_put_kvm(vcpu->kvm);
				2485	return 0;
				2486	}
				2487
				2488	static struct file_operations kvm_vcpu_fops = {
				2489	.release = kvm_vcpu_release,
				2490	.unlocked_ioctl = kvm_vcpu_ioctl,
				2491	.mmap = kvm_vcpu_mmap,
				2492	.llseek = noop_llseek,
				2493	KVM_COMPAT(kvm_vcpu_compat_ioctl),
				2494	};
				2495
				2496	/*
				2497	* Allocates an inode for the vcpu.
				2498	*/
				2499	static int create_vcpu_fd(struct kvm_vcpu *vcpu)
				2500	{
				2501	char name[8 + 1 + ITOA_MAX_LEN + 1];
				2502
				2503	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
				2504	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR \| O_CLOEXEC);
				2505	}
				2506
				2507	static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
				2508	{
				2509	char dir_name[ITOA_MAX_LEN * 2];
				2510	int ret;
				2511
				2512	if (!kvm_arch_has_vcpu_debugfs())
				2513	return 0;
				2514
				2515	if (!debugfs_initialized())
				2516	return 0;
				2517
				2518	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
				2519	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
				2520	vcpu->kvm->debugfs_dentry);
				2521	if (!vcpu->debugfs_dentry)
				2522	return -ENOMEM;
				2523
				2524	ret = kvm_arch_create_vcpu_debugfs(vcpu);
				2525	if (ret < 0) {
				2526	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2527	return ret;
				2528	}
				2529
				2530	return 0;
				2531	}
				2532
				2533	/*
				2534	* Creates some virtual cpus. Good luck creating more than one.
				2535	*/
				2536	static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
				2537	{
				2538	int r;
				2539	struct kvm_vcpu *vcpu;
				2540
				2541	if (id >= KVM_MAX_VCPU_ID)
				2542	return -EINVAL;
				2543
				2544	mutex_lock(&kvm->lock);
				2545	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
				2546	mutex_unlock(&kvm->lock);
				2547	return -EINVAL;
				2548	}
				2549
				2550	kvm->created_vcpus++;
				2551	mutex_unlock(&kvm->lock);
				2552
				2553	vcpu = kvm_arch_vcpu_create(kvm, id);
				2554	if (IS_ERR(vcpu)) {
				2555	r = PTR_ERR(vcpu);
				2556	goto vcpu_decrement;
				2557	}
				2558
				2559	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
				2560
				2561	r = kvm_arch_vcpu_setup(vcpu);
				2562	if (r)
				2563	goto vcpu_destroy;
				2564
				2565	r = kvm_create_vcpu_debugfs(vcpu);
				2566	if (r)
				2567	goto vcpu_destroy;
				2568
				2569	mutex_lock(&kvm->lock);
				2570	if (kvm_get_vcpu_by_id(kvm, id)) {
				2571	r = -EEXIST;
				2572	goto unlock_vcpu_destroy;
				2573	}
				2574
				2575	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
				2576
				2577	/* Now it's all set up, let userspace reach it */
				2578	kvm_get_kvm(kvm);
				2579	r = create_vcpu_fd(vcpu);
				2580	if (r < 0) {
				2581	kvm_put_kvm(kvm);
				2582	goto unlock_vcpu_destroy;
				2583	}
				2584
				2585	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
				2586
				2587	/*
				2588	* Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
				2589	* before kvm->online_vcpu's incremented value.
				2590	*/
				2591	smp_wmb();
				2592	atomic_inc(&kvm->online_vcpus);
				2593
				2594	mutex_unlock(&kvm->lock);
				2595	kvm_arch_vcpu_postcreate(vcpu);
				2596	return r;
				2597
				2598	unlock_vcpu_destroy:
				2599	mutex_unlock(&kvm->lock);
				2600	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2601	vcpu_destroy:
				2602	kvm_arch_vcpu_destroy(vcpu);
				2603	vcpu_decrement:
				2604	mutex_lock(&kvm->lock);
				2605	kvm->created_vcpus--;
				2606	mutex_unlock(&kvm->lock);
				2607	return r;
				2608	}
				2609
				2610	static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu vcpu, sigset_t sigset)
				2611	{
				2612	if (sigset) {
				2613	sigdelsetmask(sigset, sigmask(SIGKILL)\|sigmask(SIGSTOP));
				2614	vcpu->sigset_active = 1;
				2615	vcpu->sigset = *sigset;
				2616	} else
				2617	vcpu->sigset_active = 0;
				2618	return 0;
				2619	}
				2620
				2621	static long kvm_vcpu_ioctl(struct file *filp,
				2622	unsigned int ioctl, unsigned long arg)
				2623	{
				2624	struct kvm_vcpu *vcpu = filp->private_data;
				2625	void __user argp = (void __user )arg;
				2626	int r;
				2627	struct kvm_fpu *fpu = NULL;
				2628	struct kvm_sregs *kvm_sregs = NULL;
				2629
				2630	if (vcpu->kvm->mm != current->mm)
				2631	return -EIO;
				2632
				2633	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
				2634	return -EINVAL;
				2635
				2636	/*
				2637	* Some architectures have vcpu ioctls that are asynchronous to vcpu
				2638	* execution; mutex_lock() would break them.
				2639	*/
				2640	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
				2641	if (r != -ENOIOCTLCMD)
				2642	return r;
				2643
				2644	if (mutex_lock_killable(&vcpu->mutex))
				2645	return -EINTR;
				2646	switch (ioctl) {
				2647	case KVM_RUN: {
				2648	struct pid *oldpid;
				2649	r = -EINVAL;
				2650	if (arg)
				2651	goto out;
				2652	oldpid = rcu_access_pointer(vcpu->pid);
				2653	if (unlikely(oldpid != task_pid(current))) {
				2654	/* The thread running this VCPU changed. */
				2655	struct pid *newpid;
				2656
				2657	r = kvm_arch_vcpu_run_pid_change(vcpu);
				2658	if (r)
				2659	break;
				2660
				2661	newpid = get_task_pid(current, PIDTYPE_PID);
				2662	rcu_assign_pointer(vcpu->pid, newpid);
				2663	if (oldpid)
				2664	synchronize_rcu();
				2665	put_pid(oldpid);
				2666	}
				2667	r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
				2668	trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
				2669	break;
				2670	}
				2671	case KVM_GET_REGS: {
				2672	struct kvm_regs *kvm_regs;
				2673
				2674	r = -ENOMEM;
				2675	kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
				2676	if (!kvm_regs)
				2677	goto out;
				2678	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
				2679	if (r)
				2680	goto out_free1;
				2681	r = -EFAULT;
				2682	if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
				2683	goto out_free1;
				2684	r = 0;
				2685	out_free1:
				2686	kfree(kvm_regs);
				2687	break;
				2688	}
				2689	case KVM_SET_REGS: {
				2690	struct kvm_regs *kvm_regs;
				2691
				2692	r = -ENOMEM;
				2693	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
				2694	if (IS_ERR(kvm_regs)) {
				2695	r = PTR_ERR(kvm_regs);
				2696	goto out;
				2697	}
				2698	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
				2699	kfree(kvm_regs);
				2700	break;
				2701	}
				2702	case KVM_GET_SREGS: {
				2703	kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
				2704	r = -ENOMEM;
				2705	if (!kvm_sregs)
				2706	goto out;
				2707	r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
				2708	if (r)
				2709	goto out;
				2710	r = -EFAULT;
				2711	if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
				2712	goto out;
				2713	r = 0;
				2714	break;
				2715	}
				2716	case KVM_SET_SREGS: {
				2717	kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
				2718	if (IS_ERR(kvm_sregs)) {
				2719	r = PTR_ERR(kvm_sregs);
				2720	kvm_sregs = NULL;
				2721	goto out;
				2722	}
				2723	r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
				2724	break;
				2725	}
				2726	case KVM_GET_MP_STATE: {
				2727	struct kvm_mp_state mp_state;
				2728
				2729	r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
				2730	if (r)
				2731	goto out;
				2732	r = -EFAULT;
				2733	if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
				2734	goto out;
				2735	r = 0;
				2736	break;
				2737	}
				2738	case KVM_SET_MP_STATE: {
				2739	struct kvm_mp_state mp_state;
				2740
				2741	r = -EFAULT;
				2742	if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
				2743	goto out;
				2744	r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
				2745	break;
				2746	}
				2747	case KVM_TRANSLATE: {
				2748	struct kvm_translation tr;
				2749
				2750	r = -EFAULT;
				2751	if (copy_from_user(&tr, argp, sizeof(tr)))
				2752	goto out;
				2753	r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
				2754	if (r)
				2755	goto out;
				2756	r = -EFAULT;
				2757	if (copy_to_user(argp, &tr, sizeof(tr)))
				2758	goto out;
				2759	r = 0;
				2760	break;
				2761	}
				2762	case KVM_SET_GUEST_DEBUG: {
				2763	struct kvm_guest_debug dbg;
				2764
				2765	r = -EFAULT;
				2766	if (copy_from_user(&dbg, argp, sizeof(dbg)))
				2767	goto out;
				2768	r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
				2769	break;
				2770	}
				2771	case KVM_SET_SIGNAL_MASK: {
				2772	struct kvm_signal_mask __user *sigmask_arg = argp;
				2773	struct kvm_signal_mask kvm_sigmask;
				2774	sigset_t sigset, *p;
				2775
				2776	p = NULL;
				2777	if (argp) {
				2778	r = -EFAULT;
				2779	if (copy_from_user(&kvm_sigmask, argp,
				2780	sizeof(kvm_sigmask)))
				2781	goto out;
				2782	r = -EINVAL;
				2783	if (kvm_sigmask.len != sizeof(sigset))
				2784	goto out;
				2785	r = -EFAULT;
				2786	if (copy_from_user(&sigset, sigmask_arg->sigset,
				2787	sizeof(sigset)))
				2788	goto out;
				2789	p = &sigset;
				2790	}
				2791	r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
				2792	break;
				2793	}
				2794	case KVM_GET_FPU: {
				2795	fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
				2796	r = -ENOMEM;
				2797	if (!fpu)
				2798	goto out;
				2799	r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
				2800	if (r)
				2801	goto out;
				2802	r = -EFAULT;
				2803	if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
				2804	goto out;
				2805	r = 0;
				2806	break;
				2807	}
				2808	case KVM_SET_FPU: {
				2809	fpu = memdup_user(argp, sizeof(*fpu));
				2810	if (IS_ERR(fpu)) {
				2811	r = PTR_ERR(fpu);
				2812	fpu = NULL;
				2813	goto out;
				2814	}
				2815	r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
				2816	break;
				2817	}
				2818	default:
				2819	r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
				2820	}
				2821	out:
				2822	mutex_unlock(&vcpu->mutex);
				2823	kfree(fpu);
				2824	kfree(kvm_sregs);
				2825	return r;
				2826	}
				2827
				2828	#ifdef CONFIG_KVM_COMPAT
				2829	static long kvm_vcpu_compat_ioctl(struct file *filp,
				2830	unsigned int ioctl, unsigned long arg)
				2831	{
				2832	struct kvm_vcpu *vcpu = filp->private_data;
				2833	void __user *argp = compat_ptr(arg);
				2834	int r;
				2835
				2836	if (vcpu->kvm->mm != current->mm)
				2837	return -EIO;
				2838
				2839	switch (ioctl) {
				2840	case KVM_SET_SIGNAL_MASK: {
				2841	struct kvm_signal_mask __user *sigmask_arg = argp;
				2842	struct kvm_signal_mask kvm_sigmask;
				2843	sigset_t sigset;
				2844
				2845	if (argp) {
				2846	r = -EFAULT;
				2847	if (copy_from_user(&kvm_sigmask, argp,
				2848	sizeof(kvm_sigmask)))
				2849	goto out;
				2850	r = -EINVAL;
				2851	if (kvm_sigmask.len != sizeof(compat_sigset_t))
				2852	goto out;
				2853	r = -EFAULT;
				2854	if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
				2855	goto out;
				2856	r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
				2857	} else
				2858	r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
				2859	break;
				2860	}
				2861	default:
				2862	r = kvm_vcpu_ioctl(filp, ioctl, arg);
				2863	}
				2864
				2865	out:
				2866	return r;
				2867	}
				2868	#endif
				2869
				2870	static int kvm_device_ioctl_attr(struct kvm_device *dev,
				2871	int (accessor)(struct kvm_device dev,
				2872	struct kvm_device_attr *attr),
				2873	unsigned long arg)
				2874	{
				2875	struct kvm_device_attr attr;
				2876
				2877	if (!accessor)
				2878	return -EPERM;
				2879
				2880	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
				2881	return -EFAULT;
				2882
				2883	return accessor(dev, &attr);
				2884	}
				2885
				2886	static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
				2887	unsigned long arg)
				2888	{
				2889	struct kvm_device *dev = filp->private_data;
				2890
				2891	if (dev->kvm->mm != current->mm)
				2892	return -EIO;
				2893
				2894	switch (ioctl) {
				2895	case KVM_SET_DEVICE_ATTR:
				2896	return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
				2897	case KVM_GET_DEVICE_ATTR:
				2898	return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
				2899	case KVM_HAS_DEVICE_ATTR:
				2900	return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
				2901	default:
				2902	if (dev->ops->ioctl)
				2903	return dev->ops->ioctl(dev, ioctl, arg);
				2904
				2905	return -ENOTTY;
				2906	}
				2907	}
				2908
				2909	static int kvm_device_release(struct inode inode, struct file filp)
				2910	{
				2911	struct kvm_device *dev = filp->private_data;
				2912	struct kvm *kvm = dev->kvm;
				2913
				2914	kvm_put_kvm(kvm);
				2915	return 0;
				2916	}
				2917
				2918	static const struct file_operations kvm_device_fops = {
				2919	.unlocked_ioctl = kvm_device_ioctl,
				2920	.release = kvm_device_release,
				2921	KVM_COMPAT(kvm_device_ioctl),
				2922	};
				2923
				2924	struct kvm_device kvm_device_from_filp(struct file filp)
				2925	{
				2926	if (filp->f_op != &kvm_device_fops)
				2927	return NULL;
				2928
				2929	return filp->private_data;
				2930	}
				2931
				2932	static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
				2933	#ifdef CONFIG_KVM_MPIC
				2934	[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
				2935	[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
				2936	#endif
				2937	};
				2938
				2939	int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
				2940	{
				2941	if (type >= ARRAY_SIZE(kvm_device_ops_table))
				2942	return -ENOSPC;
				2943
				2944	if (kvm_device_ops_table[type] != NULL)
				2945	return -EEXIST;
				2946
				2947	kvm_device_ops_table[type] = ops;
				2948	return 0;
				2949	}
				2950
				2951	void kvm_unregister_device_ops(u32 type)
				2952	{
				2953	if (kvm_device_ops_table[type] != NULL)
				2954	kvm_device_ops_table[type] = NULL;
				2955	}
				2956
				2957	static int kvm_ioctl_create_device(struct kvm *kvm,
				2958	struct kvm_create_device *cd)
				2959	{
				2960	struct kvm_device_ops *ops = NULL;
				2961	struct kvm_device *dev;
				2962	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
				2963	int type;
				2964	int ret;
				2965
				2966	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
				2967	return -ENODEV;
				2968
				2969	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
				2970	ops = kvm_device_ops_table[type];
				2971	if (ops == NULL)
				2972	return -ENODEV;
				2973
				2974	if (test)
				2975	return 0;
				2976
				2977	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
				2978	if (!dev)
				2979	return -ENOMEM;
				2980
				2981	dev->ops = ops;
				2982	dev->kvm = kvm;
				2983
				2984	mutex_lock(&kvm->lock);
				2985	ret = ops->create(dev, type);
				2986	if (ret < 0) {
				2987	mutex_unlock(&kvm->lock);
				2988	kfree(dev);
				2989	return ret;
				2990	}
				2991	list_add(&dev->vm_node, &kvm->devices);
				2992	mutex_unlock(&kvm->lock);
				2993
				2994	if (ops->init)
				2995	ops->init(dev);
				2996
				2997	kvm_get_kvm(kvm);
				2998	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR \| O_CLOEXEC);
				2999	if (ret < 0) {
				3000	kvm_put_kvm(kvm);
				3001	mutex_lock(&kvm->lock);
				3002	list_del(&dev->vm_node);
				3003	mutex_unlock(&kvm->lock);
				3004	ops->destroy(dev);
				3005	return ret;
				3006	}
				3007
				3008	cd->fd = ret;
				3009	return 0;
				3010	}
				3011
				3012	static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
				3013	{
				3014	switch (arg) {
				3015	case KVM_CAP_USER_MEMORY:
				3016	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
				3017	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
				3018	case KVM_CAP_INTERNAL_ERROR_DATA:
				3019	#ifdef CONFIG_HAVE_KVM_MSI
				3020	case KVM_CAP_SIGNAL_MSI:
				3021	#endif
				3022	#ifdef CONFIG_HAVE_KVM_IRQFD
				3023	case KVM_CAP_IRQFD:
				3024	case KVM_CAP_IRQFD_RESAMPLE:
				3025	#endif
				3026	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
				3027	case KVM_CAP_CHECK_EXTENSION_VM:
				3028	return 1;
				3029	#ifdef CONFIG_KVM_MMIO
				3030	case KVM_CAP_COALESCED_MMIO:
				3031	return KVM_COALESCED_MMIO_PAGE_OFFSET;
				3032	#endif
				3033	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3034	case KVM_CAP_IRQ_ROUTING:
				3035	return KVM_MAX_IRQ_ROUTES;
				3036	#endif
				3037	#if KVM_ADDRESS_SPACE_NUM > 1
				3038	case KVM_CAP_MULTI_ADDRESS_SPACE:
				3039	return KVM_ADDRESS_SPACE_NUM;
				3040	#endif
				3041	default:
				3042	break;
				3043	}
				3044	return kvm_vm_ioctl_check_extension(kvm, arg);
				3045	}
				3046
				3047	static long kvm_vm_ioctl(struct file *filp,
				3048	unsigned int ioctl, unsigned long arg)
				3049	{
				3050	struct kvm *kvm = filp->private_data;
				3051	void __user argp = (void __user )arg;
				3052	int r;
				3053
				3054	if (kvm->mm != current->mm)
				3055	return -EIO;
				3056	switch (ioctl) {
				3057	case KVM_CREATE_VCPU:
				3058	r = kvm_vm_ioctl_create_vcpu(kvm, arg);
				3059	break;
				3060	case KVM_SET_USER_MEMORY_REGION: {
				3061	struct kvm_userspace_memory_region kvm_userspace_mem;
				3062
				3063	r = -EFAULT;
				3064	if (copy_from_user(&kvm_userspace_mem, argp,
				3065	sizeof(kvm_userspace_mem)))
				3066	goto out;
				3067
				3068	r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
				3069	break;
				3070	}
				3071	case KVM_GET_DIRTY_LOG: {
				3072	struct kvm_dirty_log log;
				3073
				3074	r = -EFAULT;
				3075	if (copy_from_user(&log, argp, sizeof(log)))
				3076	goto out;
				3077	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3078	break;
				3079	}
				3080	#ifdef CONFIG_KVM_MMIO
				3081	case KVM_REGISTER_COALESCED_MMIO: {
				3082	struct kvm_coalesced_mmio_zone zone;
				3083
				3084	r = -EFAULT;
				3085	if (copy_from_user(&zone, argp, sizeof(zone)))
				3086	goto out;
				3087	r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
				3088	break;
				3089	}
				3090	case KVM_UNREGISTER_COALESCED_MMIO: {
				3091	struct kvm_coalesced_mmio_zone zone;
				3092
				3093	r = -EFAULT;
				3094	if (copy_from_user(&zone, argp, sizeof(zone)))
				3095	goto out;
				3096	r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
				3097	break;
				3098	}
				3099	#endif
				3100	case KVM_IRQFD: {
				3101	struct kvm_irqfd data;
				3102
				3103	r = -EFAULT;
				3104	if (copy_from_user(&data, argp, sizeof(data)))
				3105	goto out;
				3106	r = kvm_irqfd(kvm, &data);
				3107	break;
				3108	}
				3109	case KVM_IOEVENTFD: {
				3110	struct kvm_ioeventfd data;
				3111
				3112	r = -EFAULT;
				3113	if (copy_from_user(&data, argp, sizeof(data)))
				3114	goto out;
				3115	r = kvm_ioeventfd(kvm, &data);
				3116	break;
				3117	}
				3118	#ifdef CONFIG_HAVE_KVM_MSI
				3119	case KVM_SIGNAL_MSI: {
				3120	struct kvm_msi msi;
				3121
				3122	r = -EFAULT;
				3123	if (copy_from_user(&msi, argp, sizeof(msi)))
				3124	goto out;
				3125	r = kvm_send_userspace_msi(kvm, &msi);
				3126	break;
				3127	}
				3128	#endif
				3129	#ifdef __KVM_HAVE_IRQ_LINE
				3130	case KVM_IRQ_LINE_STATUS:
				3131	case KVM_IRQ_LINE: {
				3132	struct kvm_irq_level irq_event;
				3133
				3134	r = -EFAULT;
				3135	if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
				3136	goto out;
				3137
				3138	r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
				3139	ioctl == KVM_IRQ_LINE_STATUS);
				3140	if (r)
				3141	goto out;
				3142
				3143	r = -EFAULT;
				3144	if (ioctl == KVM_IRQ_LINE_STATUS) {
				3145	if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
				3146	goto out;
				3147	}
				3148
				3149	r = 0;
				3150	break;
				3151	}
				3152	#endif
				3153	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3154	case KVM_SET_GSI_ROUTING: {
				3155	struct kvm_irq_routing routing;
				3156	struct kvm_irq_routing __user *urouting;
				3157	struct kvm_irq_routing_entry *entries = NULL;
				3158
				3159	r = -EFAULT;
				3160	if (copy_from_user(&routing, argp, sizeof(routing)))
				3161	goto out;
				3162	r = -EINVAL;
				3163	if (!kvm_arch_can_set_irq_routing(kvm))
				3164	goto out;
				3165	if (routing.nr > KVM_MAX_IRQ_ROUTES)
				3166	goto out;
				3167	if (routing.flags)
				3168	goto out;
				3169	if (routing.nr) {
				3170	r = -ENOMEM;
				3171	entries = vmalloc(array_size(sizeof(*entries),
				3172	routing.nr));
				3173	if (!entries)
				3174	goto out;
				3175	r = -EFAULT;
				3176	urouting = argp;
				3177	if (copy_from_user(entries, urouting->entries,
				3178	routing.nr * sizeof(*entries)))
				3179	goto out_free_irq_routing;
				3180	}
				3181	r = kvm_set_irq_routing(kvm, entries, routing.nr,
				3182	routing.flags);
				3183	out_free_irq_routing:
				3184	vfree(entries);
				3185	break;
				3186	}
				3187	#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
				3188	case KVM_CREATE_DEVICE: {
				3189	struct kvm_create_device cd;
				3190
				3191	r = -EFAULT;
				3192	if (copy_from_user(&cd, argp, sizeof(cd)))
				3193	goto out;
				3194
				3195	r = kvm_ioctl_create_device(kvm, &cd);
				3196	if (r)
				3197	goto out;
				3198
				3199	r = -EFAULT;
				3200	if (copy_to_user(argp, &cd, sizeof(cd)))
				3201	goto out;
				3202
				3203	r = 0;
				3204	break;
				3205	}
				3206	case KVM_CHECK_EXTENSION:
				3207	r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
				3208	break;
				3209	default:
				3210	r = kvm_arch_vm_ioctl(filp, ioctl, arg);
				3211	}
				3212	out:
				3213	return r;
				3214	}
				3215
				3216	#ifdef CONFIG_KVM_COMPAT
				3217	struct compat_kvm_dirty_log {
				3218	__u32 slot;
				3219	__u32 padding1;
				3220	union {
				3221	compat_uptr_t dirty_bitmap; /* one bit per page */
				3222	__u64 padding2;
				3223	};
				3224	};
				3225
				3226	static long kvm_vm_compat_ioctl(struct file *filp,
				3227	unsigned int ioctl, unsigned long arg)
				3228	{
				3229	struct kvm *kvm = filp->private_data;
				3230	int r;
				3231
				3232	if (kvm->mm != current->mm)
				3233	return -EIO;
				3234	switch (ioctl) {
				3235	case KVM_GET_DIRTY_LOG: {
				3236	struct compat_kvm_dirty_log compat_log;
				3237	struct kvm_dirty_log log;
				3238
				3239	if (copy_from_user(&compat_log, (void __user *)arg,
				3240	sizeof(compat_log)))
				3241	return -EFAULT;
				3242	log.slot = compat_log.slot;
				3243	log.padding1 = compat_log.padding1;
				3244	log.padding2 = compat_log.padding2;
				3245	log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
				3246
				3247	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3248	break;
				3249	}
				3250	default:
				3251	r = kvm_vm_ioctl(filp, ioctl, arg);
				3252	}
				3253	return r;
				3254	}
				3255	#endif
				3256
				3257	static struct file_operations kvm_vm_fops = {
				3258	.release = kvm_vm_release,
				3259	.unlocked_ioctl = kvm_vm_ioctl,
				3260	.llseek = noop_llseek,
				3261	KVM_COMPAT(kvm_vm_compat_ioctl),
				3262	};
				3263
				3264	static int kvm_dev_ioctl_create_vm(unsigned long type)
				3265	{
				3266	int r;
				3267	struct kvm *kvm;
				3268	struct file *file;
				3269
				3270	kvm = kvm_create_vm(type);
				3271	if (IS_ERR(kvm))
				3272	return PTR_ERR(kvm);
				3273	#ifdef CONFIG_KVM_MMIO
				3274	r = kvm_coalesced_mmio_init(kvm);
				3275	if (r < 0)
				3276	goto put_kvm;
				3277	#endif
				3278	r = get_unused_fd_flags(O_CLOEXEC);
				3279	if (r < 0)
				3280	goto put_kvm;
				3281
				3282	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
				3283	if (IS_ERR(file)) {
				3284	put_unused_fd(r);
				3285	r = PTR_ERR(file);
				3286	goto put_kvm;
				3287	}
				3288
				3289	/*
				3290	* Don't call kvm_put_kvm anymore at this point; file->f_op is
				3291	* already set, with ->release() being kvm_vm_release(). In error
				3292	* cases it will be called by the final fput(file) and will take
				3293	* care of doing kvm_put_kvm(kvm).
				3294	*/
				3295	if (kvm_create_vm_debugfs(kvm, r) < 0) {
				3296	put_unused_fd(r);
				3297	fput(file);
				3298	return -ENOMEM;
				3299	}
				3300	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
				3301
				3302	fd_install(r, file);
				3303	return r;
				3304
				3305	put_kvm:
				3306	kvm_put_kvm(kvm);
				3307	return r;
				3308	}
				3309
				3310	static long kvm_dev_ioctl(struct file *filp,
				3311	unsigned int ioctl, unsigned long arg)
				3312	{
				3313	long r = -EINVAL;
				3314
				3315	switch (ioctl) {
				3316	case KVM_GET_API_VERSION:
				3317	if (arg)
				3318	goto out;
				3319	r = KVM_API_VERSION;
				3320	break;
				3321	case KVM_CREATE_VM:
				3322	r = kvm_dev_ioctl_create_vm(arg);
				3323	break;
				3324	case KVM_CHECK_EXTENSION:
				3325	r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
				3326	break;
				3327	case KVM_GET_VCPU_MMAP_SIZE:
				3328	if (arg)
				3329	goto out;
				3330	r = PAGE_SIZE; /* struct kvm_run */
				3331	#ifdef CONFIG_X86
				3332	r += PAGE_SIZE; /* pio data page */
				3333	#endif
				3334	#ifdef CONFIG_KVM_MMIO
				3335	r += PAGE_SIZE; /* coalesced mmio ring page */
				3336	#endif
				3337	break;
				3338	case KVM_TRACE_ENABLE:
				3339	case KVM_TRACE_PAUSE:
				3340	case KVM_TRACE_DISABLE:
				3341	r = -EOPNOTSUPP;
				3342	break;
				3343	default:
				3344	return kvm_arch_dev_ioctl(filp, ioctl, arg);
				3345	}
				3346	out:
				3347	return r;
				3348	}
				3349
				3350	static struct file_operations kvm_chardev_ops = {
				3351	.unlocked_ioctl = kvm_dev_ioctl,
				3352	.llseek = noop_llseek,
				3353	KVM_COMPAT(kvm_dev_ioctl),
				3354	};
				3355
				3356	static struct miscdevice kvm_dev = {
				3357	KVM_MINOR,
				3358	"kvm",
				3359	&kvm_chardev_ops,
				3360	};
				3361
				3362	static void hardware_enable_nolock(void *junk)
				3363	{
				3364	int cpu = raw_smp_processor_id();
				3365	int r;
				3366
				3367	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3368	return;
				3369
				3370	cpumask_set_cpu(cpu, cpus_hardware_enabled);
				3371
				3372	r = kvm_arch_hardware_enable();
				3373
				3374	if (r) {
				3375	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3376	atomic_inc(&hardware_enable_failed);
				3377	pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
				3378	}
				3379	}
				3380
				3381	static int kvm_starting_cpu(unsigned int cpu)
				3382	{
				3383	raw_spin_lock(&kvm_count_lock);
				3384	if (kvm_usage_count)
				3385	hardware_enable_nolock(NULL);
				3386	raw_spin_unlock(&kvm_count_lock);
				3387	return 0;
				3388	}
				3389
				3390	static void hardware_disable_nolock(void *junk)
				3391	{
				3392	int cpu = raw_smp_processor_id();
				3393
				3394	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3395	return;
				3396	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3397	kvm_arch_hardware_disable();
				3398	}
				3399
				3400	static int kvm_dying_cpu(unsigned int cpu)
				3401	{
				3402	raw_spin_lock(&kvm_count_lock);
				3403	if (kvm_usage_count)
				3404	hardware_disable_nolock(NULL);
				3405	raw_spin_unlock(&kvm_count_lock);
				3406	return 0;
				3407	}
				3408
				3409	static void hardware_disable_all_nolock(void)
				3410	{
				3411	BUG_ON(!kvm_usage_count);
				3412
				3413	kvm_usage_count--;
				3414	if (!kvm_usage_count)
				3415	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3416	}
				3417
				3418	static void hardware_disable_all(void)
				3419	{
				3420	raw_spin_lock(&kvm_count_lock);
				3421	hardware_disable_all_nolock();
				3422	raw_spin_unlock(&kvm_count_lock);
				3423	}
				3424
				3425	static int hardware_enable_all(void)
				3426	{
				3427	int r = 0;
				3428
				3429	raw_spin_lock(&kvm_count_lock);
				3430
				3431	kvm_usage_count++;
				3432	if (kvm_usage_count == 1) {
				3433	atomic_set(&hardware_enable_failed, 0);
				3434	on_each_cpu(hardware_enable_nolock, NULL, 1);
				3435
				3436	if (atomic_read(&hardware_enable_failed)) {
				3437	hardware_disable_all_nolock();
				3438	r = -EBUSY;
				3439	}
				3440	}
				3441
				3442	raw_spin_unlock(&kvm_count_lock);
				3443
				3444	return r;
				3445	}
				3446
				3447	static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
				3448	void *v)
				3449	{
				3450	/*
				3451	* Some (well, at least mine) BIOSes hang on reboot if
				3452	* in vmx root mode.
				3453	*
				3454	* And Intel TXT required VMX off for all cpu when system shutdown.
				3455	*/
				3456	pr_info("kvm: exiting hardware virtualization\n");
				3457	kvm_rebooting = true;
				3458	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3459	return NOTIFY_OK;
				3460	}
				3461
				3462	static struct notifier_block kvm_reboot_notifier = {
				3463	.notifier_call = kvm_reboot,
				3464	.priority = 0,
				3465	};
				3466
				3467	static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
				3468	{
				3469	int i;
				3470
				3471	for (i = 0; i < bus->dev_count; i++) {
				3472	struct kvm_io_device *pos = bus->range[i].dev;
				3473
				3474	kvm_iodevice_destructor(pos);
				3475	}
				3476	kfree(bus);
				3477	}
				3478
				3479	static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
				3480	const struct kvm_io_range *r2)
				3481	{
				3482	gpa_t addr1 = r1->addr;
				3483	gpa_t addr2 = r2->addr;
				3484
				3485	if (addr1 < addr2)
				3486	return -1;
				3487
				3488	/* If r2->len == 0, match the exact address. If r2->len != 0,
				3489	* accept any overlapping write. Any order is acceptable for
				3490	* overlapping ranges, because kvm_io_bus_get_first_dev ensures
				3491	* we process all of them.
				3492	*/
				3493	if (r2->len) {
				3494	addr1 += r1->len;
				3495	addr2 += r2->len;
				3496	}
				3497
				3498	if (addr1 > addr2)
				3499	return 1;
				3500
				3501	return 0;
				3502	}
				3503
				3504	static int kvm_io_bus_sort_cmp(const void p1, const void p2)
				3505	{
				3506	return kvm_io_bus_cmp(p1, p2);
				3507	}
				3508
				3509	static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
				3510	gpa_t addr, int len)
				3511	{
				3512	struct kvm_io_range *range, key;
				3513	int off;
				3514
				3515	key = (struct kvm_io_range) {
				3516	.addr = addr,
				3517	.len = len,
				3518	};
				3519
				3520	range = bsearch(&key, bus->range, bus->dev_count,
				3521	sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
				3522	if (range == NULL)
				3523	return -ENOENT;
				3524
				3525	off = range - bus->range;
				3526
				3527	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
				3528	off--;
				3529
				3530	return off;
				3531	}
				3532
				3533	static int __kvm_io_bus_write(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				3534	struct kvm_io_range range, const void val)
				3535	{
				3536	int idx;
				3537
				3538	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				3539	if (idx < 0)
				3540	return -EOPNOTSUPP;
				3541
				3542	while (idx < bus->dev_count &&
				3543	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				3544	if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
				3545	range->len, val))
				3546	return idx;
				3547	idx++;
				3548	}
				3549
				3550	return -EOPNOTSUPP;
				3551	}
				3552
				3553	/* kvm_io_bus_write - called under kvm->slots_lock */
				3554	int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				3555	int len, const void *val)
				3556	{
				3557	struct kvm_io_bus *bus;
				3558	struct kvm_io_range range;
				3559	int r;
				3560
				3561	range = (struct kvm_io_range) {
				3562	.addr = addr,
				3563	.len = len,
				3564	};
				3565
				3566	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3567	if (!bus)
				3568	return -ENOMEM;
				3569	r = __kvm_io_bus_write(vcpu, bus, &range, val);
				3570	return r < 0 ? r : 0;
				3571	}
				3572
				3573	/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
				3574	int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
				3575	gpa_t addr, int len, const void *val, long cookie)
				3576	{
				3577	struct kvm_io_bus *bus;
				3578	struct kvm_io_range range;
				3579
				3580	range = (struct kvm_io_range) {
				3581	.addr = addr,
				3582	.len = len,
				3583	};
				3584
				3585	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3586	if (!bus)
				3587	return -ENOMEM;
				3588
				3589	/* First try the device referenced by cookie. */
				3590	if ((cookie >= 0) && (cookie < bus->dev_count) &&
				3591	(kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
				3592	if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
				3593	val))
				3594	return cookie;
				3595
				3596	/*
				3597	* cookie contained garbage; fall back to search and return the
				3598	* correct cookie value.
				3599	*/
				3600	return __kvm_io_bus_write(vcpu, bus, &range, val);
				3601	}
				3602
				3603	static int __kvm_io_bus_read(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				3604	struct kvm_io_range range, void val)
				3605	{
				3606	int idx;
				3607
				3608	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				3609	if (idx < 0)
				3610	return -EOPNOTSUPP;
				3611
				3612	while (idx < bus->dev_count &&
				3613	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				3614	if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
				3615	range->len, val))
				3616	return idx;
				3617	idx++;
				3618	}
				3619
				3620	return -EOPNOTSUPP;
				3621	}
				3622	EXPORT_SYMBOL_GPL(kvm_io_bus_write);
				3623
				3624	/* kvm_io_bus_read - called under kvm->slots_lock */
				3625	int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				3626	int len, void *val)
				3627	{
				3628	struct kvm_io_bus *bus;
				3629	struct kvm_io_range range;
				3630	int r;
				3631
				3632	range = (struct kvm_io_range) {
				3633	.addr = addr,
				3634	.len = len,
				3635	};
				3636
				3637	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3638	if (!bus)
				3639	return -ENOMEM;
				3640	r = __kvm_io_bus_read(vcpu, bus, &range, val);
				3641	return r < 0 ? r : 0;
				3642	}
				3643
				3644
				3645	/* Caller must hold slots_lock. */
				3646	int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
				3647	int len, struct kvm_io_device *dev)
				3648	{
				3649	int i;
				3650	struct kvm_io_bus new_bus, bus;
				3651	struct kvm_io_range range;
				3652
				3653	bus = kvm_get_bus(kvm, bus_idx);
				3654	if (!bus)
				3655	return -ENOMEM;
				3656
				3657	/* exclude ioeventfd which is limited by maximum fd */
				3658	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
				3659	return -ENOSPC;
				3660
				3661	new_bus = kmalloc(sizeof(bus) + ((bus->dev_count + 1)
				3662	sizeof(struct kvm_io_range)), GFP_KERNEL);
				3663	if (!new_bus)
				3664	return -ENOMEM;
				3665
				3666	range = (struct kvm_io_range) {
				3667	.addr = addr,
				3668	.len = len,
				3669	.dev = dev,
				3670	};
				3671
				3672	for (i = 0; i < bus->dev_count; i++)
				3673	if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
				3674	break;
				3675
				3676	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
				3677	new_bus->dev_count++;
				3678	new_bus->range[i] = range;
				3679	memcpy(new_bus->range + i + 1, bus->range + i,
				3680	(bus->dev_count - i) * sizeof(struct kvm_io_range));
				3681	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				3682	synchronize_srcu_expedited(&kvm->srcu);
				3683	kfree(bus);
				3684
				3685	return 0;
				3686	}
				3687
				3688	/* Caller must hold slots_lock. */
				3689	void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
				3690	struct kvm_io_device *dev)
				3691	{
				3692	int i;
				3693	struct kvm_io_bus new_bus, bus;
				3694
				3695	bus = kvm_get_bus(kvm, bus_idx);
				3696	if (!bus)
				3697	return;
				3698
				3699	for (i = 0; i < bus->dev_count; i++)
				3700	if (bus->range[i].dev == dev) {
				3701	break;
				3702	}
				3703
				3704	if (i == bus->dev_count)
				3705	return;
				3706
				3707	new_bus = kmalloc(sizeof(bus) + ((bus->dev_count - 1)
				3708	sizeof(struct kvm_io_range)), GFP_KERNEL);
				3709	if (!new_bus) {
				3710	pr_err("kvm: failed to shrink bus, removing it completely\n");
				3711	goto broken;
				3712	}
				3713
				3714	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
				3715	new_bus->dev_count--;
				3716	memcpy(new_bus->range + i, bus->range + i + 1,
				3717	(new_bus->dev_count - i) * sizeof(struct kvm_io_range));
				3718
				3719	broken:
				3720	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				3721	synchronize_srcu_expedited(&kvm->srcu);
				3722	kfree(bus);
				3723	return;
				3724	}
				3725
				3726	struct kvm_io_device kvm_io_bus_get_dev(struct kvm kvm, enum kvm_bus bus_idx,
				3727	gpa_t addr)
				3728	{
				3729	struct kvm_io_bus *bus;
				3730	int dev_idx, srcu_idx;
				3731	struct kvm_io_device *iodev = NULL;
				3732
				3733	srcu_idx = srcu_read_lock(&kvm->srcu);
				3734
				3735	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
				3736	if (!bus)
				3737	goto out_unlock;
				3738
				3739	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
				3740	if (dev_idx < 0)
				3741	goto out_unlock;
				3742
				3743	iodev = bus->range[dev_idx].dev;
				3744
				3745	out_unlock:
				3746	srcu_read_unlock(&kvm->srcu, srcu_idx);
				3747
				3748	return iodev;
				3749	}
				3750	EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
				3751
				3752	static int kvm_debugfs_open(struct inode inode, struct file file,
				3753	int (get)(void , u64 ), int (set)(void *, u64),
				3754	const char *fmt)
				3755	{
				3756	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				3757	inode->i_private;
				3758
				3759	/* The debugfs files are a reference to the kvm struct which
				3760	* is still valid when kvm_destroy_vm is called.
				3761	* To avoid the race between open and the removal of the debugfs
				3762	* directory we test against the users count.
				3763	*/
				3764	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
				3765	return -ENOENT;
				3766
				3767	if (simple_attr_open(inode, file, get,
				3768	stat_data->mode & S_IWUGO ? set : NULL,
				3769	fmt)) {
				3770	kvm_put_kvm(stat_data->kvm);
				3771	return -ENOMEM;
				3772	}
				3773
				3774	return 0;
				3775	}
				3776
				3777	static int kvm_debugfs_release(struct inode inode, struct file file)
				3778	{
				3779	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				3780	inode->i_private;
				3781
				3782	simple_attr_release(inode, file);
				3783	kvm_put_kvm(stat_data->kvm);
				3784
				3785	return 0;
				3786	}
				3787
				3788	static int vm_stat_get_per_vm(void data, u64 val)
				3789	{
				3790	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3791
				3792	val = (ulong )((void )stat_data->kvm + stat_data->offset);
				3793
				3794	return 0;
				3795	}
				3796
				3797	static int vm_stat_clear_per_vm(void *data, u64 val)
				3798	{
				3799	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3800
				3801	if (val)
				3802	return -EINVAL;
				3803
				3804	(ulong )((void *)stat_data->kvm + stat_data->offset) = 0;
				3805
				3806	return 0;
				3807	}
				3808
				3809	static int vm_stat_get_per_vm_open(struct inode inode, struct file file)
				3810	{
				3811	__simple_attr_check_format("%llu\n", 0ull);
				3812	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
				3813	vm_stat_clear_per_vm, "%llu\n");
				3814	}
				3815
				3816	static const struct file_operations vm_stat_get_per_vm_fops = {
				3817	.owner = THIS_MODULE,
				3818	.open = vm_stat_get_per_vm_open,
				3819	.release = kvm_debugfs_release,
				3820	.read = simple_attr_read,
				3821	.write = simple_attr_write,
				3822	.llseek = no_llseek,
				3823	};
				3824
				3825	static int vcpu_stat_get_per_vm(void data, u64 val)
				3826	{
				3827	int i;
				3828	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3829	struct kvm_vcpu *vcpu;
				3830
				3831	*val = 0;
				3832
				3833	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				3834	val += (u64 )((void )vcpu + stat_data->offset);
				3835
				3836	return 0;
				3837	}
				3838
				3839	static int vcpu_stat_clear_per_vm(void *data, u64 val)
				3840	{
				3841	int i;
				3842	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3843	struct kvm_vcpu *vcpu;
				3844
				3845	if (val)
				3846	return -EINVAL;
				3847
				3848	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				3849	(u64 )((void *)vcpu + stat_data->offset) = 0;
				3850
				3851	return 0;
				3852	}
				3853
				3854	static int vcpu_stat_get_per_vm_open(struct inode inode, struct file file)
				3855	{
				3856	__simple_attr_check_format("%llu\n", 0ull);
				3857	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
				3858	vcpu_stat_clear_per_vm, "%llu\n");
				3859	}
				3860
				3861	static const struct file_operations vcpu_stat_get_per_vm_fops = {
				3862	.owner = THIS_MODULE,
				3863	.open = vcpu_stat_get_per_vm_open,
				3864	.release = kvm_debugfs_release,
				3865	.read = simple_attr_read,
				3866	.write = simple_attr_write,
				3867	.llseek = no_llseek,
				3868	};
				3869
				3870	static const struct file_operations *stat_fops_per_vm[] = {
				3871	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
				3872	[KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
				3873	};
				3874
				3875	static int vm_stat_get(void _offset, u64 val)
				3876	{
				3877	unsigned offset = (long)_offset;
				3878	struct kvm *kvm;
				3879	struct kvm_stat_data stat_tmp = {.offset = offset};
				3880	u64 tmp_val;
				3881
				3882	*val = 0;
				3883	mutex_lock(&kvm_lock);
				3884	list_for_each_entry(kvm, &vm_list, vm_list) {
				3885	stat_tmp.kvm = kvm;
				3886	vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				3887	*val += tmp_val;
				3888	}
				3889	mutex_unlock(&kvm_lock);
				3890	return 0;
				3891	}
				3892
				3893	static int vm_stat_clear(void *_offset, u64 val)
				3894	{
				3895	unsigned offset = (long)_offset;
				3896	struct kvm *kvm;
				3897	struct kvm_stat_data stat_tmp = {.offset = offset};
				3898
				3899	if (val)
				3900	return -EINVAL;
				3901
				3902	mutex_lock(&kvm_lock);
				3903	list_for_each_entry(kvm, &vm_list, vm_list) {
				3904	stat_tmp.kvm = kvm;
				3905	vm_stat_clear_per_vm((void *)&stat_tmp, 0);
				3906	}
				3907	mutex_unlock(&kvm_lock);
				3908
				3909	return 0;
				3910	}
				3911
				3912	DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
				3913
				3914	static int vcpu_stat_get(void _offset, u64 val)
				3915	{
				3916	unsigned offset = (long)_offset;
				3917	struct kvm *kvm;
				3918	struct kvm_stat_data stat_tmp = {.offset = offset};
				3919	u64 tmp_val;
				3920
				3921	*val = 0;
				3922	mutex_lock(&kvm_lock);
				3923	list_for_each_entry(kvm, &vm_list, vm_list) {
				3924	stat_tmp.kvm = kvm;
				3925	vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				3926	*val += tmp_val;
				3927	}
				3928	mutex_unlock(&kvm_lock);
				3929	return 0;
				3930	}
				3931
				3932	static int vcpu_stat_clear(void *_offset, u64 val)
				3933	{
				3934	unsigned offset = (long)_offset;
				3935	struct kvm *kvm;
				3936	struct kvm_stat_data stat_tmp = {.offset = offset};
				3937
				3938	if (val)
				3939	return -EINVAL;
				3940
				3941	mutex_lock(&kvm_lock);
				3942	list_for_each_entry(kvm, &vm_list, vm_list) {
				3943	stat_tmp.kvm = kvm;
				3944	vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
				3945	}
				3946	mutex_unlock(&kvm_lock);
				3947
				3948	return 0;
				3949	}
				3950
				3951	DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
				3952	"%llu\n");
				3953
				3954	static const struct file_operations *stat_fops[] = {
				3955	[KVM_STAT_VCPU] = &vcpu_stat_fops,
				3956	[KVM_STAT_VM] = &vm_stat_fops,
				3957	};
				3958
				3959	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
				3960	{
				3961	struct kobj_uevent_env *env;
				3962	unsigned long long created, active;
				3963
				3964	if (!kvm_dev.this_device \|\| !kvm)
				3965	return;
				3966
				3967	mutex_lock(&kvm_lock);
				3968	if (type == KVM_EVENT_CREATE_VM) {
				3969	kvm_createvm_count++;
				3970	kvm_active_vms++;
				3971	} else if (type == KVM_EVENT_DESTROY_VM) {
				3972	kvm_active_vms--;
				3973	}
				3974	created = kvm_createvm_count;
				3975	active = kvm_active_vms;
				3976	mutex_unlock(&kvm_lock);
				3977
				3978	env = kzalloc(sizeof(*env), GFP_KERNEL);
				3979	if (!env)
				3980	return;
				3981
				3982	add_uevent_var(env, "CREATED=%llu", created);
				3983	add_uevent_var(env, "COUNT=%llu", active);
				3984
				3985	if (type == KVM_EVENT_CREATE_VM) {
				3986	add_uevent_var(env, "EVENT=create");
				3987	kvm->userspace_pid = task_pid_nr(current);
				3988	} else if (type == KVM_EVENT_DESTROY_VM) {
				3989	add_uevent_var(env, "EVENT=destroy");
				3990	}
				3991	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
				3992
				3993	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
				3994	char tmp, p = kmalloc(PATH_MAX, GFP_KERNEL);
				3995
				3996	if (p) {
				3997	tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
				3998	if (!IS_ERR(tmp))
				3999	add_uevent_var(env, "STATS_PATH=%s", tmp);
				4000	kfree(p);
				4001	}
				4002	}
				4003	/* no need for checks, since we are adding at most only 5 keys */
				4004	env->envp[env->envp_idx++] = NULL;
				4005	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
				4006	kfree(env);
				4007	}
				4008
				4009	static void kvm_init_debug(void)
				4010	{
				4011	struct kvm_stats_debugfs_item *p;
				4012
				4013	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
				4014
				4015	kvm_debugfs_num_entries = 0;
				4016	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
				4017	int mode = p->mode ? p->mode : 0644;
				4018	debugfs_create_file(p->name, mode, kvm_debugfs_dir,
				4019	(void *)(long)p->offset,
				4020	stat_fops[p->kind]);
				4021	}
				4022	}
				4023
				4024	static int kvm_suspend(void)
				4025	{
				4026	if (kvm_usage_count)
				4027	hardware_disable_nolock(NULL);
				4028	return 0;
				4029	}
				4030
				4031	static void kvm_resume(void)
				4032	{
				4033	if (kvm_usage_count) {
				4034	WARN_ON(raw_spin_is_locked(&kvm_count_lock));
				4035	hardware_enable_nolock(NULL);
				4036	}
				4037	}
				4038
				4039	static struct syscore_ops kvm_syscore_ops = {
				4040	.suspend = kvm_suspend,
				4041	.resume = kvm_resume,
				4042	};
				4043
				4044	static inline
				4045	struct kvm_vcpu preempt_notifier_to_vcpu(struct preempt_notifier pn)
				4046	{
				4047	return container_of(pn, struct kvm_vcpu, preempt_notifier);
				4048	}
				4049
				4050	static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
				4051	{
				4052	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4053
				4054	if (vcpu->preempted)
				4055	vcpu->preempted = false;
				4056
				4057	kvm_arch_sched_in(vcpu, cpu);
				4058
				4059	kvm_arch_vcpu_load(vcpu, cpu);
				4060	}
				4061
				4062	static void kvm_sched_out(struct preempt_notifier *pn,
				4063	struct task_struct *next)
				4064	{
				4065	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4066
				4067	if (current->state == TASK_RUNNING)
				4068	vcpu->preempted = true;
				4069	kvm_arch_vcpu_put(vcpu);
				4070	}
				4071
				4072	int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
				4073	struct module *module)
				4074	{
				4075	int r;
				4076	int cpu;
				4077
				4078	r = kvm_arch_init(opaque);
				4079	if (r)
				4080	goto out_fail;
				4081
				4082	/*
				4083	* kvm_arch_init makes sure there's at most one caller
				4084	* for architectures that support multiple implementations,
				4085	* like intel and amd on x86.
				4086	* kvm_arch_init must be called before kvm_irqfd_init to avoid creating
				4087	* conflicts in case kvm is already setup for another implementation.
				4088	*/
				4089	r = kvm_irqfd_init();
				4090	if (r)
				4091	goto out_irqfd;
				4092
				4093	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
				4094	r = -ENOMEM;
				4095	goto out_free_0;
				4096	}
				4097
				4098	r = kvm_arch_hardware_setup();
				4099	if (r < 0)
				4100	goto out_free_0a;
				4101
				4102	for_each_online_cpu(cpu) {
				4103	smp_call_function_single(cpu,
				4104	kvm_arch_check_processor_compat,
				4105	&r, 1);
				4106	if (r < 0)
				4107	goto out_free_1;
				4108	}
				4109
				4110	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				4111	kvm_starting_cpu, kvm_dying_cpu);
				4112	if (r)
				4113	goto out_free_2;
				4114	register_reboot_notifier(&kvm_reboot_notifier);
				4115
				4116	/* A kmem cache lets us meet the alignment requirements of fx_save. */
				4117	if (!vcpu_align)
				4118	vcpu_align = __alignof__(struct kvm_vcpu);
				4119	kvm_vcpu_cache =
				4120	kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
				4121	SLAB_ACCOUNT,
				4122	offsetof(struct kvm_vcpu, arch),
				4123	sizeof_field(struct kvm_vcpu, arch),
				4124	NULL);
				4125	if (!kvm_vcpu_cache) {
				4126	r = -ENOMEM;
				4127	goto out_free_3;
				4128	}
				4129
				4130	r = kvm_async_pf_init();
				4131	if (r)
				4132	goto out_free;
				4133
				4134	kvm_chardev_ops.owner = module;
				4135	kvm_vm_fops.owner = module;
				4136	kvm_vcpu_fops.owner = module;
				4137
				4138	r = misc_register(&kvm_dev);
				4139	if (r) {
				4140	pr_err("kvm: misc device register failed\n");
				4141	goto out_unreg;
				4142	}
				4143
				4144	register_syscore_ops(&kvm_syscore_ops);
				4145
				4146	kvm_preempt_ops.sched_in = kvm_sched_in;
				4147	kvm_preempt_ops.sched_out = kvm_sched_out;
				4148
				4149	kvm_init_debug();
				4150
				4151	r = kvm_vfio_ops_init();
				4152	WARN_ON(r);
				4153
				4154	return 0;
				4155
				4156	out_unreg:
				4157	kvm_async_pf_deinit();
				4158	out_free:
				4159	kmem_cache_destroy(kvm_vcpu_cache);
				4160	out_free_3:
				4161	unregister_reboot_notifier(&kvm_reboot_notifier);
				4162	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4163	out_free_2:
				4164	out_free_1:
				4165	kvm_arch_hardware_unsetup();
				4166	out_free_0a:
				4167	free_cpumask_var(cpus_hardware_enabled);
				4168	out_free_0:
				4169	kvm_irqfd_exit();
				4170	out_irqfd:
				4171	kvm_arch_exit();
				4172	out_fail:
				4173	return r;
				4174	}
				4175	EXPORT_SYMBOL_GPL(kvm_init);
				4176
				4177	void kvm_exit(void)
				4178	{
				4179	debugfs_remove_recursive(kvm_debugfs_dir);
				4180	misc_deregister(&kvm_dev);
				4181	kmem_cache_destroy(kvm_vcpu_cache);
				4182	kvm_async_pf_deinit();
				4183	unregister_syscore_ops(&kvm_syscore_ops);
				4184	unregister_reboot_notifier(&kvm_reboot_notifier);
				4185	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4186	on_each_cpu(hardware_disable_nolock, NULL, 1);
				4187	kvm_arch_hardware_unsetup();
				4188	kvm_arch_exit();
				4189	kvm_irqfd_exit();
				4190	free_cpumask_var(cpus_hardware_enabled);
				4191	kvm_vfio_ops_exit();
				4192	}
				4193	EXPORT_SYMBOL_GPL(kvm_exit);
				4194
				4195	struct kvm_vm_worker_thread_context {
				4196	struct kvm *kvm;
				4197	struct task_struct *parent;
				4198	struct completion init_done;
				4199	kvm_vm_thread_fn_t thread_fn;
				4200	uintptr_t data;
				4201	int err;
				4202	};
				4203
				4204	static int kvm_vm_worker_thread(void *context)
				4205	{
				4206	/*
				4207	* The init_context is allocated on the stack of the parent thread, so
				4208	* we have to locally copy anything that is needed beyond initialization
				4209	*/
				4210	struct kvm_vm_worker_thread_context *init_context = context;
				4211	struct kvm *kvm = init_context->kvm;
				4212	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
				4213	uintptr_t data = init_context->data;
				4214	int err;
				4215
				4216	err = kthread_park(current);
				4217	/* kthread_park(current) is never supposed to return an error */
				4218	WARN_ON(err != 0);
				4219	if (err)
				4220	goto init_complete;
				4221
				4222	err = cgroup_attach_task_all(init_context->parent, current);
				4223	if (err) {
				4224	kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
				4225	__func__, err);
				4226	goto init_complete;
				4227	}
				4228
				4229	set_user_nice(current, task_nice(init_context->parent));
				4230
				4231	init_complete:
				4232	init_context->err = err;
				4233	complete(&init_context->init_done);
				4234	init_context = NULL;
				4235
				4236	if (err)
				4237	return err;
				4238
				4239	/* Wait to be woken up by the spawner before proceeding. */
				4240	kthread_parkme();
				4241
				4242	if (!kthread_should_stop())
				4243	err = thread_fn(kvm, data);
				4244
				4245	return err;
				4246	}
				4247
				4248	int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				4249	uintptr_t data, const char *name,
				4250	struct task_struct **thread_ptr)
				4251	{
				4252	struct kvm_vm_worker_thread_context init_context = {};
				4253	struct task_struct *thread;
				4254
				4255	*thread_ptr = NULL;
				4256	init_context.kvm = kvm;
				4257	init_context.parent = current;
				4258	init_context.thread_fn = thread_fn;
				4259	init_context.data = data;
				4260	init_completion(&init_context.init_done);
				4261
				4262	thread = kthread_run(kvm_vm_worker_thread, &init_context,
				4263	"%s-%d", name, task_pid_nr(current));
				4264	if (IS_ERR(thread))
				4265	return PTR_ERR(thread);
				4266
				4267	/* kthread_run is never supposed to return NULL */
				4268	WARN_ON(thread == NULL);
				4269
				4270	wait_for_completion(&init_context.init_done);
				4271
				4272	if (!init_context.err)
				4273	*thread_ptr = thread;
				4274
				4275	return init_context.err;
				4276	}