Blame - src/kernel/linux/v4.14/virt/kvm/kvm_main.c - T103

blob: c1ca4d40157b165811cb18307a1ad0594edfd680 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* This module enables machines with Intel VT-x extensions to run virtual
				5	* machines without emulation or binary translation.
				6	*
				7	* Copyright (C) 2006 Qumranet, Inc.
				8	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				9	*
				10	* Authors:
				11	* Avi Kivity <avi@qumranet.com>
				12	* Yaniv Kamay <yaniv@qumranet.com>
				13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2. See
				15	* the COPYING file in the top-level directory.
				16	*
				17	*/
				18
				19	#include <kvm/iodev.h>
				20
				21	#include <linux/kvm_host.h>
				22	#include <linux/kvm.h>
				23	#include <linux/module.h>
				24	#include <linux/errno.h>
				25	#include <linux/percpu.h>
				26	#include <linux/mm.h>
				27	#include <linux/miscdevice.h>
				28	#include <linux/vmalloc.h>
				29	#include <linux/reboot.h>
				30	#include <linux/debugfs.h>
				31	#include <linux/highmem.h>
				32	#include <linux/file.h>
				33	#include <linux/syscore_ops.h>
				34	#include <linux/cpu.h>
				35	#include <linux/sched/signal.h>
				36	#include <linux/sched/mm.h>
				37	#include <linux/sched/stat.h>
				38	#include <linux/cpumask.h>
				39	#include <linux/smp.h>
				40	#include <linux/anon_inodes.h>
				41	#include <linux/profile.h>
				42	#include <linux/kvm_para.h>
				43	#include <linux/pagemap.h>
				44	#include <linux/mman.h>
				45	#include <linux/swap.h>
				46	#include <linux/bitops.h>
				47	#include <linux/spinlock.h>
				48	#include <linux/compat.h>
				49	#include <linux/srcu.h>
				50	#include <linux/hugetlb.h>
				51	#include <linux/slab.h>
				52	#include <linux/sort.h>
				53	#include <linux/bsearch.h>
				54	#include <linux/kthread.h>
				55
				56	#include <asm/processor.h>
				57	#include <asm/io.h>
				58	#include <asm/ioctl.h>
				59	#include <linux/uaccess.h>
				60	#include <asm/pgtable.h>
				61
				62	#include "coalesced_mmio.h"
				63	#include "async_pf.h"
				64	#include "vfio.h"
				65
				66	#define CREATE_TRACE_POINTS
				67	#include <trace/events/kvm.h>
				68
				69	/* Worst case buffer size needed for holding an integer. */
				70	#define ITOA_MAX_LEN 12
				71
				72	MODULE_AUTHOR("Qumranet");
				73	MODULE_LICENSE("GPL");
				74
				75	/* Architectures should define their poll value according to the halt latency */
				76	unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
				77	module_param(halt_poll_ns, uint, 0644);
				78	EXPORT_SYMBOL_GPL(halt_poll_ns);
				79
				80	/* Default doubles per-vcpu halt_poll_ns. */
				81	unsigned int halt_poll_ns_grow = 2;
				82	module_param(halt_poll_ns_grow, uint, 0644);
				83	EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
				84
				85	/* Default resets per-vcpu halt_poll_ns. */
				86	unsigned int halt_poll_ns_shrink;
				87	module_param(halt_poll_ns_shrink, uint, 0644);
				88	EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
				89
				90	/*
				91	* Ordering of locks:
				92	*
				93	* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
				94	*/
				95
					/*
					 * kvm_lock is held while a VM is added to or removed from vm_list
					 * (see kvm_create_vm() and kvm_destroy_vm()).
					 */
				96	DEFINE_MUTEX(kvm_lock);
				97	static DEFINE_RAW_SPINLOCK(kvm_count_lock);
					/* Global list of VMs, linked through kvm->vm_list. */
				98	LIST_HEAD(vm_list);
				99
					/*
					 * NOTE(review): presumably bookkeeping for hardware_enable_all() and
					 * hardware_disable_all(); their definitions are not visible here.
					 */
				100	static cpumask_var_t cpus_hardware_enabled;
				101	static int kvm_usage_count;
				102	static atomic_t hardware_enable_failed;
				103
				104	struct kmem_cache *kvm_vcpu_cache;
				105	EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
				106
				107	static __read_mostly struct preempt_ops kvm_preempt_ops;
				108
					/* Parent directory under which each VM's debugfs directory is created. */
				109	struct dentry *kvm_debugfs_dir;
				110	EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
				111
					/* Element count of each VM's debugfs_stat_data array. */
				112	static int kvm_debugfs_num_entries;
					/* Indexed by a debugfs entry's ->kind when per-VM stat files are created. */
				113	static const struct file_operations *stat_fops_per_vm[];
				114
				115	static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
				116	unsigned long arg);
				117	#ifdef CONFIG_KVM_COMPAT
				118	static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				119	unsigned long arg);
				120	#endif
				121	static int hardware_enable_all(void);
				122	static void hardware_disable_all(void);
				123
				124	static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
				125
				126	static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
				127	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
				128
				129	__visible bool kvm_rebooting;
				130	EXPORT_SYMBOL_GPL(kvm_rebooting);
				131
				132	static bool largepages_enabled = true;
				133
					/* Event type codes for the first argument of kvm_uevent_notify_change(). */
				134	#define KVM_EVENT_CREATE_VM 0
				135	#define KVM_EVENT_DESTROY_VM 1
				136	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
				137	static unsigned long long kvm_createvm_count;
				138	static unsigned long long kvm_active_vms;
				139
				140	__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
				141	unsigned long start, unsigned long end)
				142	{
				143	}
				144
				145	bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
				146	{
				147	/*
				148	* The metadata used by is_zone_device_page() to determine whether or
				149	* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
				150	* the device has been pinned, e.g. by get_user_pages(). WARN if the
				151	* page_count() is zero to help detect bad usage of this helper.
				152	*/
				153	if (!pfn_valid(pfn) \|\| WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
				154	return false;
				155
				156	return is_zone_device_page(pfn_to_page(pfn));
				157	}
				158
				159	bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
				160	{
				161	/*
				162	* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
				163	* perspective they are "normal" pages, albeit with slightly different
				164	* usage rules.
				165	*/
				166	if (pfn_valid(pfn))
				167	return PageReserved(pfn_to_page(pfn)) &&
				168	!is_zero_pfn(pfn) &&
				169	!kvm_is_zone_device_pfn(pfn);
				170
				171	return true;
				172	}
				173
				174	/*
				175	* Switches to specified vcpu, until a matching vcpu_put()
				176	*/
				177	int vcpu_load(struct kvm_vcpu *vcpu)
				178	{
				179	int cpu;
				180
				181	if (mutex_lock_killable(&vcpu->mutex))
				182	return -EINTR;
				183	cpu = get_cpu();
				184	preempt_notifier_register(&vcpu->preempt_notifier);
				185	kvm_arch_vcpu_load(vcpu, cpu);
				186	put_cpu();
				187	return 0;
				188	}
				189	EXPORT_SYMBOL_GPL(vcpu_load);
				190
				191	void vcpu_put(struct kvm_vcpu *vcpu)
				192	{
				193	preempt_disable();
				194	kvm_arch_vcpu_put(vcpu);
				195	preempt_notifier_unregister(&vcpu->preempt_notifier);
				196	preempt_enable();
				197	mutex_unlock(&vcpu->mutex);
				198	}
				199	EXPORT_SYMBOL_GPL(vcpu_put);
				200
				201	/* TODO: merge with kvm_arch_vcpu_should_kick */
				202	static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
				203	{
				204	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
				205
				206	/*
				207	* We need to wait for the VCPU to reenable interrupts and get out of
				208	* READING_SHADOW_PAGE_TABLES mode.
				209	*/
				210	if (req & KVM_REQUEST_WAIT)
				211	return mode != OUTSIDE_GUEST_MODE;
				212
				213	/*
				214	* Need to kick a running VCPU, but otherwise there is nothing to do.
				215	*/
				216	return mode == IN_GUEST_MODE;
				217	}
				218
				/*
				 * Empty callback handed to smp_call_function_many() by
				 * kvm_kick_many_cpus(); the argument is unused.
				 */
				static void ack_flush(void *_completed)
				{
					/* Intentionally empty. */
				}
				222
				223	static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
				224	{
				225	if (unlikely(!cpus))
				226	cpus = cpu_online_mask;
				227
				228	if (cpumask_empty(cpus))
				229	return false;
				230
				231	smp_call_function_many(cpus, ack_flush, NULL, wait);
				232	return true;
				233	}
				234
				235	bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
				236	{
				237	int i, cpu, me;
				238	cpumask_var_t cpus;
				239	bool called;
				240	struct kvm_vcpu *vcpu;
				241
				242	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
				243
				244	me = get_cpu();
				245	kvm_for_each_vcpu(i, vcpu, kvm) {
				246	kvm_make_request(req, vcpu);
				247	cpu = vcpu->cpu;
				248
				249	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
				250	continue;
				251
				252	if (cpus != NULL && cpu != -1 && cpu != me &&
				253	kvm_request_needs_ipi(vcpu, req))
				254	__cpumask_set_cpu(cpu, cpus);
				255	}
				256	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
				257	put_cpu();
				258	free_cpumask_var(cpus);
				259	return called;
				260	}
				261
				262	#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
				263	void kvm_flush_remote_tlbs(struct kvm *kvm)
				264	{
				265	/*
				266	* Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
				267	* kvm_make_all_cpus_request.
				268	*/
				269	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
				270
				271	/*
				272	* We want to publish modifications to the page tables before reading
				273	* mode. Pairs with a memory barrier in arch-specific code.
				274	* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
				275	* and smp_mb in walk_shadow_page_lockless_begin/end.
				276	* - powerpc: smp_mb in kvmppc_prepare_to_enter.
				277	*
				278	* There is already an smp_mb__after_atomic() before
				279	* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
				280	* barrier here.
				281	*/
				282	if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
				283	++kvm->stat.remote_tlb_flush;
				284	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
				285	}
				286	EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
				287	#endif
				288
				289	void kvm_reload_remote_mmus(struct kvm *kvm)
				290	{
				291	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
				292	}
				293
				294	int kvm_vcpu_init(struct kvm_vcpu vcpu, struct kvm kvm, unsigned id)
				295	{
				296	struct page *page;
				297	int r;
				298
				299	mutex_init(&vcpu->mutex);
				300	vcpu->cpu = -1;
				301	vcpu->kvm = kvm;
				302	vcpu->vcpu_id = id;
				303	vcpu->pid = NULL;
				304	init_swait_queue_head(&vcpu->wq);
				305	kvm_async_pf_vcpu_init(vcpu);
				306
				307	vcpu->pre_pcpu = -1;
				308	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
				309
				310	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				311	if (!page) {
				312	r = -ENOMEM;
				313	goto fail;
				314	}
				315	vcpu->run = page_address(page);
				316
				317	kvm_vcpu_set_in_spin_loop(vcpu, false);
				318	kvm_vcpu_set_dy_eligible(vcpu, false);
				319	vcpu->preempted = false;
				320
				321	r = kvm_arch_vcpu_init(vcpu);
				322	if (r < 0)
				323	goto fail_free_run;
				324	return 0;
				325
				326	fail_free_run:
				327	free_page((unsigned long)vcpu->run);
				328	fail:
				329	return r;
				330	}
				331	EXPORT_SYMBOL_GPL(kvm_vcpu_init);
				332
				333	void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
				334	{
				335	/*
				336	* no need for rcu_read_lock as VCPU_RUN is the only place that
				337	* will change the vcpu->pid pointer and on uninit all file
				338	* descriptors are already gone.
				339	*/
				340	put_pid(rcu_dereference_protected(vcpu->pid, 1));
				341	kvm_arch_vcpu_uninit(vcpu);
				342	free_page((unsigned long)vcpu->run);
				343	}
				344	EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
				345
				346	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				347	static inline struct kvm mmu_notifier_to_kvm(struct mmu_notifier mn)
				348	{
				349	return container_of(mn, struct kvm, mmu_notifier);
				350	}
				351
				352	static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
				353	struct mm_struct *mm,
				354	unsigned long address,
				355	pte_t pte)
				356	{
				357	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				358	int idx;
				359
				360	idx = srcu_read_lock(&kvm->srcu);
				361	spin_lock(&kvm->mmu_lock);
				362	kvm->mmu_notifier_seq++;
				363	kvm_set_spte_hva(kvm, address, pte);
				364	spin_unlock(&kvm->mmu_lock);
				365	srcu_read_unlock(&kvm->srcu, idx);
				366	}
				367
				368	static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
				369	struct mm_struct *mm,
				370	unsigned long start,
				371	unsigned long end)
				372	{
				373	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				374	int need_tlb_flush = 0, idx;
				375
				376	idx = srcu_read_lock(&kvm->srcu);
				377	spin_lock(&kvm->mmu_lock);
				378	/*
				379	* The count increase must become visible at unlock time as no
				380	* spte can be established without taking the mmu_lock and
				381	* count is also read inside the mmu_lock critical section.
				382	*/
				383	kvm->mmu_notifier_count++;
				384	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
				385	need_tlb_flush \|= kvm->tlbs_dirty;
				386	/* we've to flush the tlb before the pages can be freed */
				387	if (need_tlb_flush)
				388	kvm_flush_remote_tlbs(kvm);
				389
				390	spin_unlock(&kvm->mmu_lock);
				391
				392	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
				393
				394	srcu_read_unlock(&kvm->srcu, idx);
				395	}
				396
				397	static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
				398	struct mm_struct *mm,
				399	unsigned long start,
				400	unsigned long end)
				401	{
				402	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				403
				404	spin_lock(&kvm->mmu_lock);
				405	/*
				406	* This sequence increase will notify the kvm page fault that
				407	* the page that is going to be mapped in the spte could have
				408	* been freed.
				409	*/
				410	kvm->mmu_notifier_seq++;
				411	smp_wmb();
				412	/*
				413	* The above sequence increase must be visible before the
				414	* below count decrease, which is ensured by the smp_wmb above
				415	* in conjunction with the smp_rmb in mmu_notifier_retry().
				416	*/
				417	kvm->mmu_notifier_count--;
				418	spin_unlock(&kvm->mmu_lock);
				419
				420	BUG_ON(kvm->mmu_notifier_count < 0);
				421	}
				422
				423	static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
				424	struct mm_struct *mm,
				425	unsigned long start,
				426	unsigned long end)
				427	{
				428	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				429	int young, idx;
				430
				431	idx = srcu_read_lock(&kvm->srcu);
				432	spin_lock(&kvm->mmu_lock);
				433
				434	young = kvm_age_hva(kvm, start, end);
				435	if (young)
				436	kvm_flush_remote_tlbs(kvm);
				437
				438	spin_unlock(&kvm->mmu_lock);
				439	srcu_read_unlock(&kvm->srcu, idx);
				440
				441	return young;
				442	}
				443
				444	static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
				445	struct mm_struct *mm,
				446	unsigned long start,
				447	unsigned long end)
				448	{
				449	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				450	int young, idx;
				451
				452	idx = srcu_read_lock(&kvm->srcu);
				453	spin_lock(&kvm->mmu_lock);
				454	/*
				455	* Even though we do not flush TLB, this will still adversely
				456	* affect performance on pre-Haswell Intel EPT, where there is
				457	* no EPT Access Bit to clear so that we have to tear down EPT
				458	* tables instead. If we find this unacceptable, we can always
				459	* add a parameter to kvm_age_hva so that it effectively doesn't
				460	* do anything on clear_young.
				461	*
				462	* Also note that currently we never issue secondary TLB flushes
				463	* from clear_young, leaving this job up to the regular system
				464	* cadence. If we find this inaccurate, we might come up with a
				465	* more sophisticated heuristic later.
				466	*/
				467	young = kvm_age_hva(kvm, start, end);
				468	spin_unlock(&kvm->mmu_lock);
				469	srcu_read_unlock(&kvm->srcu, idx);
				470
				471	return young;
				472	}
				473
				474	static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				475	struct mm_struct *mm,
				476	unsigned long address)
				477	{
				478	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				479	int young, idx;
				480
				481	idx = srcu_read_lock(&kvm->srcu);
				482	spin_lock(&kvm->mmu_lock);
				483	young = kvm_test_age_hva(kvm, address);
				484	spin_unlock(&kvm->mmu_lock);
				485	srcu_read_unlock(&kvm->srcu, idx);
				486
				487	return young;
				488	}
				489
				490	static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				491	struct mm_struct *mm)
				492	{
				493	struct kvm *kvm = mmu_notifier_to_kvm(mn);
				494	int idx;
				495
				496	idx = srcu_read_lock(&kvm->srcu);
				497	kvm_arch_flush_shadow_all(kvm);
				498	srcu_read_unlock(&kvm->srcu, idx);
				499	}
				500
				501	static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
				502	.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
				503	.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
				504	.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
				505	.clear_young = kvm_mmu_notifier_clear_young,
				506	.test_young = kvm_mmu_notifier_test_young,
				507	.change_pte = kvm_mmu_notifier_change_pte,
				508	.release = kvm_mmu_notifier_release,
				509	};
				510
				511	static int kvm_init_mmu_notifier(struct kvm *kvm)
				512	{
				513	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
				514	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
				515	}
				516
				517	#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
				518
				/* Stub for builds without MMU notifier support: always succeeds. */
				static int kvm_init_mmu_notifier(struct kvm *kvm)
				{
					/* Nothing to register. */
					return 0;
				}
				523
				524	#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
				525
				526	static struct kvm_memslots *kvm_alloc_memslots(void)
				527	{
				528	int i;
				529	struct kvm_memslots *slots;
				530
				531	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
				532	if (!slots)
				533	return NULL;
				534
				535	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
				536	slots->id_to_index[i] = slots->memslots[i].id = i;
				537
				538	return slots;
				539	}
				540
				541	static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
				542	{
				543	if (!memslot->dirty_bitmap)
				544	return;
				545
				546	kvfree(memslot->dirty_bitmap);
				547	memslot->dirty_bitmap = NULL;
				548	}
				549
				550	/*
				551	* Free any memory in @free but not in @dont.
				552	*/
				553	static void kvm_free_memslot(struct kvm kvm, struct kvm_memory_slot free,
				554	struct kvm_memory_slot *dont)
				555	{
				556	if (!dont \|\| free->dirty_bitmap != dont->dirty_bitmap)
				557	kvm_destroy_dirty_bitmap(free);
				558
				559	kvm_arch_free_memslot(kvm, free, dont);
				560
				561	free->npages = 0;
				562	}
				563
				564	static void kvm_free_memslots(struct kvm kvm, struct kvm_memslots slots)
				565	{
				566	struct kvm_memory_slot *memslot;
				567
				568	if (!slots)
				569	return;
				570
				571	kvm_for_each_memslot(memslot, slots)
				572	kvm_free_memslot(kvm, memslot, NULL);
				573
				574	kvfree(slots);
				575	}
				576
				577	static void kvm_destroy_vm_debugfs(struct kvm *kvm)
				578	{
				579	int i;
				580
				581	if (!kvm->debugfs_dentry)
				582	return;
				583
				584	debugfs_remove_recursive(kvm->debugfs_dentry);
				585
				586	if (kvm->debugfs_stat_data) {
				587	for (i = 0; i < kvm_debugfs_num_entries; i++)
				588	kfree(kvm->debugfs_stat_data[i]);
				589	kfree(kvm->debugfs_stat_data);
				590	}
				591	}
				592
				593	static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
				594	{
				595	char dir_name[ITOA_MAX_LEN * 2];
				596	struct kvm_stat_data *stat_data;
				597	struct kvm_stats_debugfs_item *p;
				598
				599	if (!debugfs_initialized())
				600	return 0;
				601
				602	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
				603	kvm->debugfs_dentry = debugfs_create_dir(dir_name,
				604	kvm_debugfs_dir);
				605	if (!kvm->debugfs_dentry)
				606	return -ENOMEM;
				607
				608	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
				609	sizeof(*kvm->debugfs_stat_data),
				610	GFP_KERNEL);
				611	if (!kvm->debugfs_stat_data)
				612	return -ENOMEM;
				613
				614	for (p = debugfs_entries; p->name; p++) {
				615	stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
				616	if (!stat_data)
				617	return -ENOMEM;
				618
				619	stat_data->kvm = kvm;
				620	stat_data->offset = p->offset;
				621	stat_data->mode = p->mode ? p->mode : 0644;
				622	kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
				623	if (!debugfs_create_file(p->name, stat_data->mode,
				624	kvm->debugfs_dentry,
				625	stat_data,
				626	stat_fops_per_vm[p->kind]))
				627	return -ENOMEM;
				628	}
				629	return 0;
				630	}
				631
				632	/*
				633	* Called after the VM is otherwise initialized, but just before adding it to
				634	* the vm_list.
				635	*/
				636	int __weak kvm_arch_post_init_vm(struct kvm *kvm)
				637	{
				638	return 0;
				639	}
				640
				641	/*
				642	* Called just after removing the VM from the vm_list, but before doing any
				643	* other destruction.
				644	*/
				645	void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
				646	{
				647	}
				648
					/*
					 * Allocate and set up a new VM of the given @type and add it to vm_list.
					 *
					 * Returns the new struct kvm on success, or an ERR_PTR() carrying a
					 * negative errno on failure.  The error labels at the bottom fall through
					 * into one another, so a failure jumps to the label matching how far the
					 * setup got.
					 */
				649	static struct kvm *kvm_create_vm(unsigned long type)
				650	{
				651	int r, i;
				652	struct kvm *kvm = kvm_arch_alloc_vm();
				653
				654	if (!kvm)
				655	return ERR_PTR(-ENOMEM);
				656
				657	spin_lock_init(&kvm->mmu_lock);
					/* Reference on the creator's mm; dropped by mmdrop() on the error path. */
				658	mmgrab(current->mm);
				659	kvm->mm = current->mm;
				660	kvm_eventfd_init(kvm);
				661	mutex_init(&kvm->lock);
				662	mutex_init(&kvm->irq_lock);
				663	mutex_init(&kvm->slots_lock);
				664	refcount_set(&kvm->users_count, 1);
				665	INIT_LIST_HEAD(&kvm->devices);
				666
				667	r = kvm_arch_init_vm(kvm, type);
				668	if (r)
				669	goto out_err_no_disable;
				670
				671	r = hardware_enable_all();
				672	if (r)
				673	goto out_err_no_disable;
				674
				675	#ifdef CONFIG_HAVE_KVM_IRQFD
				676	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
				677	#endif
				678
				679	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
				680
					/* Preset: the gotos below that do not assign r report -ENOMEM. */
				681	r = -ENOMEM;
				682	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				683	struct kvm_memslots *slots = kvm_alloc_memslots();
				684	if (!slots)
				685	goto out_err_no_srcu;
				686	/*
				687	* Generations must be different for each address space.
				688	* Init kvm generation close to the maximum to easily test the
				689	* code of handling generation number wrap-around.
				690	*/
				691	slots->generation = i * 2 - 150;
				692	rcu_assign_pointer(kvm->memslots[i], slots);
				693	}
				694
				695	if (init_srcu_struct(&kvm->srcu))
				696	goto out_err_no_srcu;
				697	if (init_srcu_struct(&kvm->irq_srcu))
				698	goto out_err_no_irq_srcu;
				699	for (i = 0; i < KVM_NR_BUSES; i++) {
				700	rcu_assign_pointer(kvm->buses[i],
				701	kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
				702	if (!kvm->buses[i])
				703	goto out_err_no_mmu_notifier;
				704	}
				705
				706	r = kvm_init_mmu_notifier(kvm);
				707	if (r)
				708	goto out_err_no_mmu_notifier;
				709
				710	r = kvm_arch_post_init_vm(kvm);
				711	if (r)
				712	goto out_err;
				713
				714	mutex_lock(&kvm_lock);
				715	list_add(&kvm->vm_list, &vm_list);
				716	mutex_unlock(&kvm_lock);
				717
				718	preempt_notifier_inc();
				719
				720	return kvm;
				721
					/* Error unwinding: each label undoes one step and falls through. */
				722	out_err:
				723	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				724	if (kvm->mmu_notifier.ops)
				725	mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
				726	#endif
				727	out_err_no_mmu_notifier:
				728	cleanup_srcu_struct(&kvm->irq_srcu);
				729	out_err_no_irq_srcu:
				730	cleanup_srcu_struct(&kvm->srcu);
				731	out_err_no_srcu:
				732	hardware_disable_all();
				733	out_err_no_disable:
				734	refcount_set(&kvm->users_count, 0);
					/* Buses and memslots are freed on every error path, however far setup got. */
				735	for (i = 0; i < KVM_NR_BUSES; i++)
				736	kfree(kvm_get_bus(kvm, i));
				737	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				738	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				739	kvm_arch_free_vm(kvm);
				740	mmdrop(current->mm);
				741	return ERR_PTR(r);
				742	}
				743
				744	static void kvm_destroy_devices(struct kvm *kvm)
				745	{
				746	struct kvm_device dev, tmp;
				747
				748	/*
				749	* We do not need to take the kvm->lock here, because nobody else
				750	* has a reference to the struct kvm at this point and therefore
				751	* cannot access the devices list anyhow.
				752	*/
				753	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
				754	list_del(&dev->vm_node);
				755	dev->ops->destroy(dev);
				756	}
				757	}
				758
					/*
					 * Tear down @kvm completely.  Called from kvm_put_kvm() once users_count
					 * has dropped to zero.
					 */
				759	static void kvm_destroy_vm(struct kvm *kvm)
				760	{
				761	int i;
					/* Saved up front so mmdrop() at the end does not touch kvm after kvm_arch_free_vm(). */
				762	struct mm_struct *mm = kvm->mm;
				763
				764	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
				765	kvm_destroy_vm_debugfs(kvm);
				766	kvm_arch_sync_events(kvm);
				767	mutex_lock(&kvm_lock);
				768	list_del(&kvm->vm_list);
				769	mutex_unlock(&kvm_lock);
				770	kvm_arch_pre_destroy_vm(kvm);
				771
				772	kvm_free_irq_routing(kvm);
				773	for (i = 0; i < KVM_NR_BUSES; i++) {
				774	struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
				775
				776	if (bus)
				777	kvm_io_bus_destroy(bus);
				778	kvm->buses[i] = NULL;
				779	}
				780	kvm_coalesced_mmio_free(kvm);
					/* Without the MMU notifier the shadow state is flushed directly instead. */
				781	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
				782	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
				783	#else
				784	kvm_arch_flush_shadow_all(kvm);
				785	#endif
				786	kvm_arch_destroy_vm(kvm);
				787	kvm_destroy_devices(kvm);
				788	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
				789	kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
				790	cleanup_srcu_struct(&kvm->irq_srcu);
				791	cleanup_srcu_struct(&kvm->srcu);
				792	kvm_arch_free_vm(kvm);
					/* Counterparts of preempt_notifier_inc(), hardware_enable_all() and mmgrab() in kvm_create_vm(). */
				793	preempt_notifier_dec();
				794	hardware_disable_all();
				795	mmdrop(mm);
				796	}
				797
				798	void kvm_get_kvm(struct kvm *kvm)
				799	{
				800	refcount_inc(&kvm->users_count);
				801	}
				802	EXPORT_SYMBOL_GPL(kvm_get_kvm);
				803
				804	void kvm_put_kvm(struct kvm *kvm)
				805	{
				806	if (refcount_dec_and_test(&kvm->users_count))
				807	kvm_destroy_vm(kvm);
				808	}
				809	EXPORT_SYMBOL_GPL(kvm_put_kvm);
				810
				811
				812	static int kvm_vm_release(struct inode inode, struct file filp)
				813	{
				814	struct kvm *kvm = filp->private_data;
				815
				816	kvm_irqfd_release(kvm);
				817
				818	kvm_put_kvm(kvm);
				819	return 0;
				820	}
				821
				822	/*
				823	* Allocation size is twice as large as the actual dirty bitmap size.
				824	* See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
				825	*/
				826	static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
				827	{
				828	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
				829
				830	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
				831	if (!memslot->dirty_bitmap)
				832	return -ENOMEM;
				833
				834	return 0;
				835	}
				836
				837	/*
				838	* Insert memslot and re-sort memslots based on their GFN,
				839	* so binary search could be used to lookup GFN.
				840	* Sorting algorithm takes advantage of having initially
				841	* sorted array and known changed memslot position.
				842	*/
				843	static void update_memslots(struct kvm_memslots *slots,
				844	struct kvm_memory_slot *new)
				845	{
				846	int id = new->id;
					/* i starts at the slot's current position and tracks where *new will land. */
				847	int i = slots->id_to_index[id];
				848	struct kvm_memory_slot *mslots = slots->memslots;
				849
				850	WARN_ON(mslots[i].id != id);
					/* Keep used_slots in step: a deletion decrements it, a creation increments it. */
				851	if (!new->npages) {
				852	WARN_ON(!mslots[i].npages);
				853	if (mslots[i].npages)
				854	slots->used_slots--;
				855	} else {
				856	if (!mslots[i].npages)
				857	slots->used_slots++;
				858	}
				859
					/*
					 * Shift the following entries down one position while new->base_gfn is
					 * <= theirs, stopping at the first entry with npages == 0.
					 */
				860	while (i < KVM_MEM_SLOTS_NUM - 1 &&
				861	new->base_gfn <= mslots[i + 1].base_gfn) {
				862	if (!mslots[i + 1].npages)
				863	break;
				864	mslots[i] = mslots[i + 1];
				865	slots->id_to_index[mslots[i].id] = i;
				866	i++;
				867	}
				868
				869	/*
				870	* The ">=" is needed when creating a slot with base_gfn == 0,
				871	* so that it moves before all those with base_gfn == npages == 0.
				872	*
				873	* On the other hand, if new->npages is zero, the above loop has
				874	* already left i pointing to the beginning of the empty part of
				875	* mslots, and the ">=" would move the hole backwards in this
				876	* case---which is wrong. So skip the loop when deleting a slot.
				877	*/
				878	if (new->npages) {
				879	while (i > 0 &&
				880	new->base_gfn >= mslots[i - 1].base_gfn) {
				881	mslots[i] = mslots[i - 1];
				882	slots->id_to_index[mslots[i].id] = i;
				883	i--;
				884	}
				885	} else
				886	WARN_ON_ONCE(i != slots->used_slots);
				887
					/* Drop the new slot into the gap and record its final index. */
				888	mslots[i] = *new;
				889	slots->id_to_index[mslots[i].id] = i;
				890	}
				891
				892	static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
				893	{
				894	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
				895
				896	#ifdef __KVM_HAVE_READONLY_MEM
				897	valid_flags \|= KVM_MEM_READONLY;
				898	#endif
				899
				900	if (mem->flags & ~valid_flags)
				901	return -EINVAL;
				902
				903	return 0;
				904	}
				905
				906	static struct kvm_memslots install_new_memslots(struct kvm kvm,
				907	int as_id, struct kvm_memslots *slots)
				908	{
				909	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
				910	u64 gen;
				911
				912	/*
				913	* Set the low bit in the generation, which disables SPTE caching
				914	* until the end of synchronize_srcu_expedited.
				915	*/
				916	WARN_ON(old_memslots->generation & 1);
				917	slots->generation = old_memslots->generation + 1;
				918
				919	rcu_assign_pointer(kvm->memslots[as_id], slots);
				920	synchronize_srcu_expedited(&kvm->srcu);
				921
				922	/*
				923	* Increment the new memslot generation a second time. This prevents
				924	* vm exits that race with memslot updates from caching a memslot
				925	* generation that will (potentially) be valid forever.
				926	*
				927	* Generations must be unique even across address spaces. We do not need
				928	* a global counter for that, instead the generation space is evenly split
				929	* across address spaces. For example, with two address spaces, address
				930	* space 0 will use generations 0, 4, 8, ... while * address space 1 will
				931	* use generations 2, 6, 10, 14, ...
				932	*/
				933	gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
				934
				935	kvm_arch_memslots_updated(kvm, gen);
				936
				937	slots->generation = gen;
				938
				939	return old_memslots;
				940	}
				941
				942	/*
				943	* Allocate some memory and give it an address in the guest physical address
				944	* space.
				945	*
				946	* Discontiguous memory is allowed, mostly for framebuffers.
				947	*
				948	* Must be called holding kvm->slots_lock for write.
				949	*/
				950	int __kvm_set_memory_region(struct kvm *kvm,
				951	const struct kvm_userspace_memory_region *mem)
				952	{
				953	int r;
				954	gfn_t base_gfn;
				955	unsigned long npages;
				956	struct kvm_memory_slot *slot;
				957	struct kvm_memory_slot old, new;
				958	struct kvm_memslots slots = NULL, old_memslots;
				959	int as_id, id;
				960	enum kvm_mr_change change;
				961
				962	r = check_memory_region_flags(mem);
				963	if (r)
				964	goto out;
				965
				966	r = -EINVAL;
				967	as_id = mem->slot >> 16;
				968	id = (u16)mem->slot;
				969
				970	/* General sanity checks */
				971	if (mem->memory_size & (PAGE_SIZE - 1))
				972	goto out;
				973	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
				974	goto out;
				975	/* We can read the guest memory with __xxx_user() later on. */
				976	if ((id < KVM_USER_MEM_SLOTS) &&
				977	((mem->userspace_addr & (PAGE_SIZE - 1)) \|\|
				978	!access_ok(VERIFY_WRITE,
				979	(void __user *)(unsigned long)mem->userspace_addr,
				980	mem->memory_size)))
				981	goto out;
				982	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_MEM_SLOTS_NUM)
				983	goto out;
				984	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
				985	goto out;
				986
				987	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
				988	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
				989	npages = mem->memory_size >> PAGE_SHIFT;
				990
				991	if (npages > KVM_MEM_MAX_NR_PAGES)
				992	goto out;
				993
				994	new = old = *slot;
				995
				996	new.id = id;
				997	new.base_gfn = base_gfn;
				998	new.npages = npages;
				999	new.flags = mem->flags;
				1000
				1001	if (npages) {
				1002	if (!old.npages)
				1003	change = KVM_MR_CREATE;
				1004	else { /* Modify an existing slot. */
				1005	if ((mem->userspace_addr != old.userspace_addr) \|\|
				1006	(npages != old.npages) \|\|
				1007	((new.flags ^ old.flags) & KVM_MEM_READONLY))
				1008	goto out;
				1009
				1010	if (base_gfn != old.base_gfn)
				1011	change = KVM_MR_MOVE;
				1012	else if (new.flags != old.flags)
				1013	change = KVM_MR_FLAGS_ONLY;
				1014	else { /* Nothing to change. */
				1015	r = 0;
				1016	goto out;
				1017	}
				1018	}
				1019	} else {
				1020	if (!old.npages)
				1021	goto out;
				1022
				1023	change = KVM_MR_DELETE;
				1024	new.base_gfn = 0;
				1025	new.flags = 0;
				1026	}
				1027
				1028	if ((change == KVM_MR_CREATE) \|\| (change == KVM_MR_MOVE)) {
				1029	/* Check for overlaps */
				1030	r = -EEXIST;
				1031	kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
				1032	if (slot->id == id)
				1033	continue;
				1034	if (!((base_gfn + npages <= slot->base_gfn) \|\|
				1035	(base_gfn >= slot->base_gfn + slot->npages)))
				1036	goto out;
				1037	}
				1038	}
				1039
				1040	/* Free page dirty bitmap if unneeded */
				1041	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
				1042	new.dirty_bitmap = NULL;
				1043
				1044	r = -ENOMEM;
				1045	if (change == KVM_MR_CREATE) {
				1046	new.userspace_addr = mem->userspace_addr;
				1047
				1048	if (kvm_arch_create_memslot(kvm, &new, npages))
				1049	goto out_free;
				1050	}
				1051
				1052	/* Allocate page dirty bitmap if needed */
				1053	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
				1054	if (kvm_create_dirty_bitmap(&new) < 0)
				1055	goto out_free;
				1056	}
				1057
				1058	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
				1059	if (!slots)
				1060	goto out_free;
				1061	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
				1062
				1063	if ((change == KVM_MR_DELETE) \|\| (change == KVM_MR_MOVE)) {
				1064	slot = id_to_memslot(slots, id);
				1065	slot->flags \|= KVM_MEMSLOT_INVALID;
				1066
				1067	old_memslots = install_new_memslots(kvm, as_id, slots);
				1068
				1069	/* From this point no new shadow pages pointing to a deleted,
				1070	* or moved, memslot will be created.
				1071	*
				1072	* validation of sp->gfn happens in:
				1073	* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
				1074	* - kvm_is_visible_gfn (mmu_check_roots)
				1075	*/
				1076	kvm_arch_flush_shadow_memslot(kvm, slot);
				1077
				1078	/*
				1079	* We can re-use the old_memslots from above, the only difference
				1080	* from the currently installed memslots is the invalid flag. This
				1081	* will get overwritten by update_memslots anyway.
				1082	*/
				1083	slots = old_memslots;
				1084	}
				1085
				1086	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
				1087	if (r)
				1088	goto out_slots;
				1089
				1090	/* actual memory is freed via old in kvm_free_memslot below */
				1091	if (change == KVM_MR_DELETE) {
				1092	new.dirty_bitmap = NULL;
				1093	memset(&new.arch, 0, sizeof(new.arch));
				1094	}
				1095
				1096	update_memslots(slots, &new);
				1097	old_memslots = install_new_memslots(kvm, as_id, slots);
				1098
				1099	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
				1100
				1101	kvm_free_memslot(kvm, &old, &new);
				1102	kvfree(old_memslots);
				1103	return 0;
				1104
				1105	out_slots:
				1106	kvfree(slots);
				1107	out_free:
				1108	kvm_free_memslot(kvm, &new, &old);
				1109	out:
				1110	return r;
				1111	}
				1112	EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
				1113
				1114	int kvm_set_memory_region(struct kvm *kvm,
				1115	const struct kvm_userspace_memory_region *mem)
				1116	{
				1117	int r;
				1118
				1119	mutex_lock(&kvm->slots_lock);
				1120	r = __kvm_set_memory_region(kvm, mem);
				1121	mutex_unlock(&kvm->slots_lock);
				1122	return r;
				1123	}
				1124	EXPORT_SYMBOL_GPL(kvm_set_memory_region);
				1125
				1126	static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				1127	struct kvm_userspace_memory_region *mem)
				1128	{
				1129	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
				1130	return -EINVAL;
				1131
				1132	return kvm_set_memory_region(kvm, mem);
				1133	}
				1134
				1135	int kvm_get_dirty_log(struct kvm *kvm,
				1136	struct kvm_dirty_log log, int is_dirty)
				1137	{
				1138	struct kvm_memslots *slots;
				1139	struct kvm_memory_slot *memslot;
				1140	int i, as_id, id;
				1141	unsigned long n;
				1142	unsigned long any = 0;
				1143
				1144	as_id = log->slot >> 16;
				1145	id = (u16)log->slot;
				1146	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
				1147	return -EINVAL;
				1148
				1149	slots = __kvm_memslots(kvm, as_id);
				1150	memslot = id_to_memslot(slots, id);
				1151	if (!memslot->dirty_bitmap)
				1152	return -ENOENT;
				1153
				1154	n = kvm_dirty_bitmap_bytes(memslot);
				1155
				1156	for (i = 0; !any && i < n/sizeof(long); ++i)
				1157	any = memslot->dirty_bitmap[i];
				1158
				1159	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
				1160	return -EFAULT;
				1161
				1162	if (any)
				1163	*is_dirty = 1;
				1164	return 0;
				1165	}
				1166	EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
				1167
				#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
				/**
				 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
				 *	are dirty write protect them for next write.
				 * @kvm:	pointer to kvm instance
				 * @log:	slot id and address to which we copy the log
				 * @is_dirty:	flag set if any page is dirty
				 *
				 * We need to keep it in mind that VCPU threads can write to the bitmap
				 * concurrently. So, to avoid losing track of dirty pages we keep the
				 * following order:
				 *
				 *    1. Take a snapshot of the bit and clear it if needed.
				 *    2. Write protect the corresponding page.
				 *    3. Copy the snapshot to the userspace.
				 *    4. Upon return caller flushes TLB's if needed.
				 *
				 * Between 2 and 4, the guest may write to the page using the remaining TLB
				 * entry.  This is not a problem because the page is reported dirty using
				 * the snapshot taken before and step 4 ensures that writes done after
				 * exiting to userspace will be logged for the next call.
				 *
				 * Returns 0 on success, -EINVAL for an out-of-range slot, -ENOENT when the
				 * slot has no dirty bitmap and -EFAULT when the copy to userspace fails.
				 */
				int kvm_get_dirty_log_protect(struct kvm *kvm,
							      struct kvm_dirty_log *log, bool *is_dirty)
				{
					struct kvm_memslots *slots;
					struct kvm_memory_slot *memslot;
					int i, as_id, id;
					unsigned long n;
					unsigned long *dirty_bitmap;
					unsigned long *dirty_bitmap_buffer;

					as_id = log->slot >> 16;
					id = (u16)log->slot;
					if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
						return -EINVAL;

					slots = __kvm_memslots(kvm, as_id);
					memslot = id_to_memslot(slots, id);

					dirty_bitmap = memslot->dirty_bitmap;
					if (!dirty_bitmap)
						return -ENOENT;

					n = kvm_dirty_bitmap_bytes(memslot);

					/*
					 * The snapshot buffer is the n bytes directly behind the live bitmap.
					 * NOTE(review): this relies on the bitmap allocation being at least
					 * 2 * n bytes - confirm against kvm_create_dirty_bitmap().
					 */
					dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
					memset(dirty_bitmap_buffer, 0, n);

					spin_lock(&kvm->mmu_lock);
					*is_dirty = false;
					for (i = 0; i < n / sizeof(long); i++) {
						unsigned long mask;
						gfn_t offset;

						if (!dirty_bitmap[i])
							continue;

						*is_dirty = true;

						/* Atomically fetch and clear the word (step 1 above). */
						mask = xchg(&dirty_bitmap[i], 0);
						dirty_bitmap_buffer[i] = mask;

						if (mask) {
							offset = i * BITS_PER_LONG;
							kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
												offset, mask);
						}
					}

					spin_unlock(&kvm->mmu_lock);
					if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
						return -EFAULT;
					return 0;
				}
				EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
				#endif
				1246
				1247	bool kvm_largepages_enabled(void)
				1248	{
				1249	return largepages_enabled;
				1250	}
				1251
				1252	void kvm_disable_largepages(void)
				1253	{
				1254	largepages_enabled = false;
				1255	}
				1256	EXPORT_SYMBOL_GPL(kvm_disable_largepages);
				1257
				1258	struct kvm_memory_slot gfn_to_memslot(struct kvm kvm, gfn_t gfn)
				1259	{
				1260	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
				1261	}
				1262	EXPORT_SYMBOL_GPL(gfn_to_memslot);
				1263
				1264	struct kvm_memory_slot kvm_vcpu_gfn_to_memslot(struct kvm_vcpu vcpu, gfn_t gfn)
				1265	{
				1266	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
				1267	}
				1268
				1269	bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
				1270	{
				1271	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
				1272
				1273	if (!memslot \|\| memslot->id >= KVM_USER_MEM_SLOTS \|\|
				1274	memslot->flags & KVM_MEMSLOT_INVALID)
				1275	return false;
				1276
				1277	return true;
				1278	}
				1279	EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
				1280
				1281	unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
				1282	{
				1283	struct vm_area_struct *vma;
				1284	unsigned long addr, size;
				1285
				1286	size = PAGE_SIZE;
				1287
				1288	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
				1289	if (kvm_is_error_hva(addr))
				1290	return PAGE_SIZE;
				1291
				1292	down_read(&current->mm->mmap_sem);
				1293	vma = find_vma(current->mm, addr);
				1294	if (!vma)
				1295	goto out;
				1296
				1297	size = vma_kernel_pagesize(vma);
				1298
				1299	out:
				1300	up_read(&current->mm->mmap_sem);
				1301
				1302	return size;
				1303	}
				1304
				1305	static bool memslot_is_readonly(struct kvm_memory_slot *slot)
				1306	{
				1307	return slot->flags & KVM_MEM_READONLY;
				1308	}
				1309
				1310	static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1311	gfn_t *nr_pages, bool write)
				1312	{
				1313	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1314	return KVM_HVA_ERR_BAD;
				1315
				1316	if (memslot_is_readonly(slot) && write)
				1317	return KVM_HVA_ERR_RO_BAD;
				1318
				1319	if (nr_pages)
				1320	*nr_pages = slot->npages - (gfn - slot->base_gfn);
				1321
				1322	return __gfn_to_hva_memslot(slot, gfn);
				1323	}
				1324
				1325	static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				1326	gfn_t *nr_pages)
				1327	{
				1328	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
				1329	}
				1330
				1331	unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				1332	gfn_t gfn)
				1333	{
				1334	return gfn_to_hva_many(slot, gfn, NULL);
				1335	}
				1336	EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
				1337
				1338	unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
				1339	{
				1340	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
				1341	}
				1342	EXPORT_SYMBOL_GPL(gfn_to_hva);
				1343
				1344	unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
				1345	{
				1346	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
				1347	}
				1348	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
				1349
				1350	/*
				1351	* If writable is set to false, the hva returned by this function is only
				1352	* allowed to be read.
				1353	*/
				1354	unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				1355	gfn_t gfn, bool *writable)
				1356	{
				1357	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
				1358
				1359	if (!kvm_is_error_hva(hva) && writable)
				1360	*writable = !memslot_is_readonly(slot);
				1361
				1362	return hva;
				1363	}
				1364
				1365	unsigned long gfn_to_hva_prot(struct kvm kvm, gfn_t gfn, bool writable)
				1366	{
				1367	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1368
				1369	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1370	}
				1371
				1372	unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu vcpu, gfn_t gfn, bool writable)
				1373	{
				1374	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1375
				1376	return gfn_to_hva_memslot_prot(slot, gfn, writable);
				1377	}
				1378
				1379	static int get_user_page_nowait(unsigned long start, int write,
				1380	struct page **page)
				1381	{
				1382	int flags = FOLL_NOWAIT \| FOLL_HWPOISON;
				1383
				1384	if (write)
				1385	flags \|= FOLL_WRITE;
				1386
				1387	return get_user_pages(start, 1, flags, page, NULL);
				1388	}
				1389
				1390	static inline int check_user_page_hwpoison(unsigned long addr)
				1391	{
				1392	int rc, flags = FOLL_HWPOISON \| FOLL_WRITE;
				1393
				1394	rc = get_user_pages(addr, 1, flags, NULL, NULL);
				1395	return rc == -EHWPOISON;
				1396	}
				1397
				1398	/*
				1399	* The atomic path to get the writable pfn which will be stored in @pfn,
				1400	* true indicates success, otherwise false is returned.
				1401	*/
				1402	static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
				1403	bool write_fault, bool writable, kvm_pfn_t pfn)
				1404	{
				1405	struct page *page[1];
				1406	int npages;
				1407
				1408	if (!(async \|\| atomic))
				1409	return false;
				1410
				1411	/*
				1412	* Fast pin a writable pfn only if it is a write fault request
				1413	* or the caller allows to map a writable pfn for a read fault
				1414	* request.
				1415	*/
				1416	if (!(write_fault \|\| writable))
				1417	return false;
				1418
				1419	npages = __get_user_pages_fast(addr, 1, 1, page);
				1420	if (npages == 1) {
				1421	*pfn = page_to_pfn(page[0]);
				1422
				1423	if (writable)
				1424	*writable = true;
				1425	return true;
				1426	}
				1427
				1428	return false;
				1429	}
				1430
				1431	/*
				1432	* The slow path to get the pfn of the specified host virtual address,
				1433	* 1 indicates success, -errno is returned if error is detected.
				1434	*/
				1435	static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
				1436	bool writable, kvm_pfn_t pfn)
				1437	{
				1438	struct page *page[1];
				1439	int npages = 0;
				1440
				1441	might_sleep();
				1442
				1443	if (writable)
				1444	*writable = write_fault;
				1445
				1446	if (async) {
				1447	down_read(&current->mm->mmap_sem);
				1448	npages = get_user_page_nowait(addr, write_fault, page);
				1449	up_read(&current->mm->mmap_sem);
				1450	} else {
				1451	unsigned int flags = FOLL_HWPOISON;
				1452
				1453	if (write_fault)
				1454	flags \|= FOLL_WRITE;
				1455
				1456	npages = get_user_pages_unlocked(addr, 1, page, flags);
				1457	}
				1458	if (npages != 1)
				1459	return npages;
				1460
				1461	/* map read fault as writable if possible */
				1462	if (unlikely(!write_fault) && writable) {
				1463	struct page *wpage[1];
				1464
				1465	npages = __get_user_pages_fast(addr, 1, 1, wpage);
				1466	if (npages == 1) {
				1467	*writable = true;
				1468	put_page(page[0]);
				1469	page[0] = wpage[0];
				1470	}
				1471
				1472	npages = 1;
				1473	}
				1474	*pfn = page_to_pfn(page[0]);
				1475	return npages;
				1476	}
				1477
				1478	static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
				1479	{
				1480	if (unlikely(!(vma->vm_flags & VM_READ)))
				1481	return false;
				1482
				1483	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
				1484	return false;
				1485
				1486	return true;
				1487	}
				1488
				1489	static int hva_to_pfn_remapped(struct vm_area_struct *vma,
				1490	unsigned long addr, bool *async,
				1491	bool write_fault, bool *writable,
				1492	kvm_pfn_t *p_pfn)
				1493	{
				1494	unsigned long pfn;
				1495	int r;
				1496
				1497	r = follow_pfn(vma, addr, &pfn);
				1498	if (r) {
				1499	/*
				1500	* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
				1501	* not call the fault handler, so do it here.
				1502	*/
				1503	bool unlocked = false;
				1504	r = fixup_user_fault(current, current->mm, addr,
				1505	(write_fault ? FAULT_FLAG_WRITE : 0),
				1506	&unlocked);
				1507	if (unlocked)
				1508	return -EAGAIN;
				1509	if (r)
				1510	return r;
				1511
				1512	r = follow_pfn(vma, addr, &pfn);
				1513	if (r)
				1514	return r;
				1515
				1516	}
				1517
				1518	if (writable)
				1519	*writable = true;
				1520
				1521	/*
				1522	* Get a reference here because callers of hva_to_pfn and
				1523	* gfn_to_pfn ultimately call kvm_release_pfn_clean on the
				1524	* returned pfn. This is only needed if the VMA has VM_MIXEDMAP
				1525	* set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
				1526	* simply do nothing for reserved pfns.
				1527	*
				1528	* Whoever called remap_pfn_range is also going to call e.g.
				1529	* unmap_mapping_range before the underlying pages are freed,
				1530	* causing a call to our MMU notifier.
				1531	*/
				1532	kvm_get_pfn(pfn);
				1533
				1534	*p_pfn = pfn;
				1535	return 0;
				1536	}
				1537
				1538	/*
				1539	* Pin guest page in memory and return its pfn.
				1540	* @addr: host virtual address which maps memory to the guest
				1541	* @atomic: whether this function can sleep
				1542	* @async: whether this function need to wait IO complete if the
				1543	* host page is not in the memory
				1544	* @write_fault: whether we should get a writable host page
				1545	* @writable: whether it allows to map a writable host page for !@write_fault
				1546	*
				1547	* The function will map a writable host page for these two cases:
				1548	* 1): @write_fault = true
				1549	* 2): @write_fault = false && @writable, @writable will tell the caller
				1550	* whether the mapping is writable.
				1551	*/
				1552	static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
				1553	bool write_fault, bool *writable)
				1554	{
				1555	struct vm_area_struct *vma;
				1556	kvm_pfn_t pfn = 0;
				1557	int npages, r;
				1558
				1559	/* we can do it either atomically or asynchronously, not both */
				1560	BUG_ON(atomic && async);
				1561
				1562	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
				1563	return pfn;
				1564
				1565	if (atomic)
				1566	return KVM_PFN_ERR_FAULT;
				1567
				1568	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
				1569	if (npages == 1)
				1570	return pfn;
				1571
				1572	down_read(&current->mm->mmap_sem);
				1573	if (npages == -EHWPOISON \|\|
				1574	(!async && check_user_page_hwpoison(addr))) {
				1575	pfn = KVM_PFN_ERR_HWPOISON;
				1576	goto exit;
				1577	}
				1578
				1579	retry:
				1580	vma = find_vma_intersection(current->mm, addr, addr + 1);
				1581
				1582	if (vma == NULL)
				1583	pfn = KVM_PFN_ERR_FAULT;
				1584	else if (vma->vm_flags & (VM_IO \| VM_PFNMAP)) {
				1585	r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
				1586	if (r == -EAGAIN)
				1587	goto retry;
				1588	if (r < 0)
				1589	pfn = KVM_PFN_ERR_FAULT;
				1590	} else {
				1591	if (async && vma_is_valid(vma, write_fault))
				1592	*async = true;
				1593	pfn = KVM_PFN_ERR_FAULT;
				1594	}
				1595	exit:
				1596	up_read(&current->mm->mmap_sem);
				1597	return pfn;
				1598	}
				1599
				1600	kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
				1601	bool atomic, bool *async, bool write_fault,
				1602	bool *writable)
				1603	{
				1604	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
				1605
				1606	if (addr == KVM_HVA_ERR_RO_BAD) {
				1607	if (writable)
				1608	*writable = false;
				1609	return KVM_PFN_ERR_RO_FAULT;
				1610	}
				1611
				1612	if (kvm_is_error_hva(addr)) {
				1613	if (writable)
				1614	*writable = false;
				1615	return KVM_PFN_NOSLOT;
				1616	}
				1617
				1618	/* Do not map writable pfn in the readonly memslot. */
				1619	if (writable && memslot_is_readonly(slot)) {
				1620	*writable = false;
				1621	writable = NULL;
				1622	}
				1623
				1624	return hva_to_pfn(addr, atomic, async, write_fault,
				1625	writable);
				1626	}
				1627	EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
				1628
				1629	kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
				1630	bool *writable)
				1631	{
				1632	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
				1633	write_fault, writable);
				1634	}
				1635	EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
				1636
				1637	kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
				1638	{
				1639	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
				1640	}
				1641	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
				1642
				1643	kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
				1644	{
				1645	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
				1646	}
				1647	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
				1648
				1649	kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
				1650	{
				1651	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
				1652	}
				1653	EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
				1654
				1655	kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
				1656	{
				1657	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1658	}
				1659	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
				1660
				1661	kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
				1662	{
				1663	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
				1664	}
				1665	EXPORT_SYMBOL_GPL(gfn_to_pfn);
				1666
				1667	kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				1668	{
				1669	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
				1670	}
				1671	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
				1672
				1673	int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				1674	struct page **pages, int nr_pages)
				1675	{
				1676	unsigned long addr;
				1677	gfn_t entry = 0;
				1678
				1679	addr = gfn_to_hva_many(slot, gfn, &entry);
				1680	if (kvm_is_error_hva(addr))
				1681	return -1;
				1682
				1683	if (entry < nr_pages)
				1684	return 0;
				1685
				1686	return __get_user_pages_fast(addr, nr_pages, 1, pages);
				1687	}
				1688	EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
				1689
				1690	static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
				1691	{
				1692	if (is_error_noslot_pfn(pfn))
				1693	return KVM_ERR_PTR_BAD_PAGE;
				1694
				1695	if (kvm_is_reserved_pfn(pfn)) {
				1696	WARN_ON(1);
				1697	return KVM_ERR_PTR_BAD_PAGE;
				1698	}
				1699
				1700	return pfn_to_page(pfn);
				1701	}
				1702
				1703	struct page gfn_to_page(struct kvm kvm, gfn_t gfn)
				1704	{
				1705	kvm_pfn_t pfn;
				1706
				1707	pfn = gfn_to_pfn(kvm, gfn);
				1708
				1709	return kvm_pfn_to_page(pfn);
				1710	}
				1711	EXPORT_SYMBOL_GPL(gfn_to_page);
				1712
				1713	struct page kvm_vcpu_gfn_to_page(struct kvm_vcpu vcpu, gfn_t gfn)
				1714	{
				1715	kvm_pfn_t pfn;
				1716
				1717	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
				1718
				1719	return kvm_pfn_to_page(pfn);
				1720	}
				1721	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
				1722
				1723	void kvm_release_page_clean(struct page *page)
				1724	{
				1725	WARN_ON(is_error_page(page));
				1726
				1727	kvm_release_pfn_clean(page_to_pfn(page));
				1728	}
				1729	EXPORT_SYMBOL_GPL(kvm_release_page_clean);
				1730
				1731	void kvm_release_pfn_clean(kvm_pfn_t pfn)
				1732	{
				1733	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
				1734	put_page(pfn_to_page(pfn));
				1735	}
				1736	EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
				1737
				1738	void kvm_release_page_dirty(struct page *page)
				1739	{
				1740	WARN_ON(is_error_page(page));
				1741
				1742	kvm_release_pfn_dirty(page_to_pfn(page));
				1743	}
				1744	EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
				1745
				1746	static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
				1747	{
				1748	kvm_set_pfn_dirty(pfn);
				1749	kvm_release_pfn_clean(pfn);
				1750	}
				1751
				1752	void kvm_set_pfn_dirty(kvm_pfn_t pfn)
				1753	{
				1754	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
				1755	struct page *page = pfn_to_page(pfn);
				1756
				1757	if (!PageReserved(page))
				1758	SetPageDirty(page);
				1759	}
				1760	}
				1761	EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
				1762
				1763	void kvm_set_pfn_accessed(kvm_pfn_t pfn)
				1764	{
				1765	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
				1766	mark_page_accessed(pfn_to_page(pfn));
				1767	}
				1768	EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
				1769
				1770	void kvm_get_pfn(kvm_pfn_t pfn)
				1771	{
				1772	if (!kvm_is_reserved_pfn(pfn))
				1773	get_page(pfn_to_page(pfn));
				1774	}
				1775	EXPORT_SYMBOL_GPL(kvm_get_pfn);
				1776
				1777	static int next_segment(unsigned long len, int offset)
				1778	{
				1779	if (len > PAGE_SIZE - offset)
				1780	return PAGE_SIZE - offset;
				1781	else
				1782	return len;
				1783	}
				1784
				1785	static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
				1786	void *data, int offset, int len)
				1787	{
				1788	int r;
				1789	unsigned long addr;
				1790
				1791	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				1792	if (kvm_is_error_hva(addr))
				1793	return -EFAULT;
				1794	r = __copy_from_user(data, (void __user *)addr + offset, len);
				1795	if (r)
				1796	return -EFAULT;
				1797	return 0;
				1798	}
				1799
				1800	int kvm_read_guest_page(struct kvm kvm, gfn_t gfn, void data, int offset,
				1801	int len)
				1802	{
				1803	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1804
				1805	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				1806	}
				1807	EXPORT_SYMBOL_GPL(kvm_read_guest_page);
				1808
				1809	int kvm_vcpu_read_guest_page(struct kvm_vcpu vcpu, gfn_t gfn, void data,
				1810	int offset, int len)
				1811	{
				1812	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1813
				1814	return __kvm_read_guest_page(slot, gfn, data, offset, len);
				1815	}
				1816	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
				1817
				1818	int kvm_read_guest(struct kvm kvm, gpa_t gpa, void data, unsigned long len)
				1819	{
				1820	gfn_t gfn = gpa >> PAGE_SHIFT;
				1821	int seg;
				1822	int offset = offset_in_page(gpa);
				1823	int ret;
				1824
				1825	while ((seg = next_segment(len, offset)) != 0) {
				1826	ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
				1827	if (ret < 0)
				1828	return ret;
				1829	offset = 0;
				1830	len -= seg;
				1831	data += seg;
				1832	++gfn;
				1833	}
				1834	return 0;
				1835	}
				1836	EXPORT_SYMBOL_GPL(kvm_read_guest);
				1837
				1838	int kvm_vcpu_read_guest(struct kvm_vcpu vcpu, gpa_t gpa, void data, unsigned long len)
				1839	{
				1840	gfn_t gfn = gpa >> PAGE_SHIFT;
				1841	int seg;
				1842	int offset = offset_in_page(gpa);
				1843	int ret;
				1844
				1845	while ((seg = next_segment(len, offset)) != 0) {
				1846	ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
				1847	if (ret < 0)
				1848	return ret;
				1849	offset = 0;
				1850	len -= seg;
				1851	data += seg;
				1852	++gfn;
				1853	}
				1854	return 0;
				1855	}
				1856	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
				1857
				1858	static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				1859	void *data, int offset, unsigned long len)
				1860	{
				1861	int r;
				1862	unsigned long addr;
				1863
				1864	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
				1865	if (kvm_is_error_hva(addr))
				1866	return -EFAULT;
				1867	pagefault_disable();
				1868	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
				1869	pagefault_enable();
				1870	if (r)
				1871	return -EFAULT;
				1872	return 0;
				1873	}
				1874
				1875	int kvm_read_guest_atomic(struct kvm kvm, gpa_t gpa, void data,
				1876	unsigned long len)
				1877	{
				1878	gfn_t gfn = gpa >> PAGE_SHIFT;
				1879	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1880	int offset = offset_in_page(gpa);
				1881
				1882	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				1883	}
				1884	EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
				1885
				1886	int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
				1887	void *data, unsigned long len)
				1888	{
				1889	gfn_t gfn = gpa >> PAGE_SHIFT;
				1890	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1891	int offset = offset_in_page(gpa);
				1892
				1893	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
				1894	}
				1895	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
				1896
				1897	static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
				1898	const void *data, int offset, int len)
				1899	{
				1900	int r;
				1901	unsigned long addr;
				1902
				1903	addr = gfn_to_hva_memslot(memslot, gfn);
				1904	if (kvm_is_error_hva(addr))
				1905	return -EFAULT;
				1906	r = __copy_to_user((void __user *)addr + offset, data, len);
				1907	if (r)
				1908	return -EFAULT;
				1909	mark_page_dirty_in_slot(memslot, gfn);
				1910	return 0;
				1911	}
				1912
				1913	int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
				1914	const void *data, int offset, int len)
				1915	{
				1916	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
				1917
				1918	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				1919	}
				1920	EXPORT_SYMBOL_GPL(kvm_write_guest_page);
				1921
				1922	int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				1923	const void *data, int offset, int len)
				1924	{
				1925	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1926
				1927	return __kvm_write_guest_page(slot, gfn, data, offset, len);
				1928	}
				1929	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
				1930
				1931	int kvm_write_guest(struct kvm kvm, gpa_t gpa, const void data,
				1932	unsigned long len)
				1933	{
				1934	gfn_t gfn = gpa >> PAGE_SHIFT;
				1935	int seg;
				1936	int offset = offset_in_page(gpa);
				1937	int ret;
				1938
				1939	while ((seg = next_segment(len, offset)) != 0) {
				1940	ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
				1941	if (ret < 0)
				1942	return ret;
				1943	offset = 0;
				1944	len -= seg;
				1945	data += seg;
				1946	++gfn;
				1947	}
				1948	return 0;
				1949	}
				1950	EXPORT_SYMBOL_GPL(kvm_write_guest);
				1951
				1952	int kvm_vcpu_write_guest(struct kvm_vcpu vcpu, gpa_t gpa, const void data,
				1953	unsigned long len)
				1954	{
				1955	gfn_t gfn = gpa >> PAGE_SHIFT;
				1956	int seg;
				1957	int offset = offset_in_page(gpa);
				1958	int ret;
				1959
				1960	while ((seg = next_segment(len, offset)) != 0) {
				1961	ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
				1962	if (ret < 0)
				1963	return ret;
				1964	offset = 0;
				1965	len -= seg;
				1966	data += seg;
				1967	++gfn;
				1968	}
				1969	return 0;
				1970	}
				1971	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
				1972
				1973	static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
				1974	struct gfn_to_hva_cache *ghc,
				1975	gpa_t gpa, unsigned long len)
				1976	{
				1977	int offset = offset_in_page(gpa);
				1978	gfn_t start_gfn = gpa >> PAGE_SHIFT;
				1979	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
				1980	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
				1981	gfn_t nr_pages_avail;
				1982
				1983	ghc->gpa = gpa;
				1984	ghc->generation = slots->generation;
				1985	ghc->len = len;
				1986	ghc->memslot = __gfn_to_memslot(slots, start_gfn);
				1987	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
				1988	if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
				1989	ghc->hva += offset;
				1990	} else {
				1991	/*
				1992	* If the requested region crosses two memslots, we still
				1993	* verify that the entire region is valid here.
				1994	*/
				1995	while (start_gfn <= end_gfn) {
				1996	nr_pages_avail = 0;
				1997	ghc->memslot = __gfn_to_memslot(slots, start_gfn);
				1998	ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
				1999	&nr_pages_avail);
				2000	if (kvm_is_error_hva(ghc->hva))
				2001	return -EFAULT;
				2002	start_gfn += nr_pages_avail;
				2003	}
				2004	/* Use the slow path for cross page reads and writes. */
				2005	ghc->memslot = NULL;
				2006	}
				2007	return 0;
				2008	}
				2009
				2010	int kvm_gfn_to_hva_cache_init(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2011	gpa_t gpa, unsigned long len)
				2012	{
				2013	struct kvm_memslots *slots = kvm_memslots(kvm);
				2014	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
				2015	}
				2016	EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
				2017
				2018	int kvm_write_guest_offset_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2019	void *data, unsigned int offset,
				2020	unsigned long len)
				2021	{
				2022	struct kvm_memslots *slots = kvm_memslots(kvm);
				2023	int r;
				2024	gpa_t gpa = ghc->gpa + offset;
				2025
				2026	BUG_ON(len + offset > ghc->len);
				2027
				2028	if (slots->generation != ghc->generation)
				2029	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2030
				2031	if (kvm_is_error_hva(ghc->hva))
				2032	return -EFAULT;
				2033
				2034	if (unlikely(!ghc->memslot))
				2035	return kvm_write_guest(kvm, gpa, data, len);
				2036
				2037	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
				2038	if (r)
				2039	return -EFAULT;
				2040	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
				2041
				2042	return 0;
				2043	}
				2044	EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
				2045
				2046	int kvm_write_guest_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2047	void *data, unsigned long len)
				2048	{
				2049	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
				2050	}
				2051	EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
				2052
				2053	int kvm_read_guest_cached(struct kvm kvm, struct gfn_to_hva_cache ghc,
				2054	void *data, unsigned long len)
				2055	{
				2056	struct kvm_memslots *slots = kvm_memslots(kvm);
				2057	int r;
				2058
				2059	BUG_ON(len > ghc->len);
				2060
				2061	if (slots->generation != ghc->generation)
				2062	__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
				2063
				2064	if (kvm_is_error_hva(ghc->hva))
				2065	return -EFAULT;
				2066
				2067	if (unlikely(!ghc->memslot))
				2068	return kvm_read_guest(kvm, ghc->gpa, data, len);
				2069
				2070	r = __copy_from_user(data, (void __user *)ghc->hva, len);
				2071	if (r)
				2072	return -EFAULT;
				2073
				2074	return 0;
				2075	}
				2076	EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
				2077
				2078	int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
				2079	{
				2080	const void zero_page = (const void ) __va(page_to_phys(ZERO_PAGE(0)));
				2081
				2082	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
				2083	}
				2084	EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
				2085
				2086	int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
				2087	{
				2088	gfn_t gfn = gpa >> PAGE_SHIFT;
				2089	int seg;
				2090	int offset = offset_in_page(gpa);
				2091	int ret;
				2092
				2093	while ((seg = next_segment(len, offset)) != 0) {
				2094	ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
				2095	if (ret < 0)
				2096	return ret;
				2097	offset = 0;
				2098	len -= seg;
				2099	++gfn;
				2100	}
				2101	return 0;
				2102	}
				2103	EXPORT_SYMBOL_GPL(kvm_clear_guest);
				2104
				2105	static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
				2106	gfn_t gfn)
				2107	{
				2108	if (memslot && memslot->dirty_bitmap) {
				2109	unsigned long rel_gfn = gfn - memslot->base_gfn;
				2110
				2111	set_bit_le(rel_gfn, memslot->dirty_bitmap);
				2112	}
				2113	}
				2114
				2115	void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
				2116	{
				2117	struct kvm_memory_slot *memslot;
				2118
				2119	memslot = gfn_to_memslot(kvm, gfn);
				2120	mark_page_dirty_in_slot(memslot, gfn);
				2121	}
				2122	EXPORT_SYMBOL_GPL(mark_page_dirty);
				2123
				2124	void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
				2125	{
				2126	struct kvm_memory_slot *memslot;
				2127
				2128	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				2129	mark_page_dirty_in_slot(memslot, gfn);
				2130	}
				2131	EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
				2132
				2133	void kvm_sigset_activate(struct kvm_vcpu *vcpu)
				2134	{
				2135	if (!vcpu->sigset_active)
				2136	return;
				2137
				2138	/*
				2139	* This does a lockless modification of ->real_blocked, which is fine
				2140	* because, only current can change ->real_blocked and all readers of
				2141	* ->real_blocked don't care as long ->real_blocked is always a subset
				2142	* of ->blocked.
				2143	*/
				2144	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
				2145	}
				2146
				2147	void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
				2148	{
				2149	if (!vcpu->sigset_active)
				2150	return;
				2151
				2152	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
				2153	sigemptyset(&current->real_blocked);
				2154	}
				2155
				2156	static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
				2157	{
				2158	unsigned int old, val, grow;
				2159
				2160	old = val = vcpu->halt_poll_ns;
				2161	grow = READ_ONCE(halt_poll_ns_grow);
				2162	/* 10us base */
				2163	if (val == 0 && grow)
				2164	val = 10000;
				2165	else
				2166	val *= grow;
				2167
				2168	if (val > halt_poll_ns)
				2169	val = halt_poll_ns;
				2170
				2171	vcpu->halt_poll_ns = val;
				2172	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
				2173	}
				2174
				2175	static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
				2176	{
				2177	unsigned int old, val, shrink;
				2178
				2179	old = val = vcpu->halt_poll_ns;
				2180	shrink = READ_ONCE(halt_poll_ns_shrink);
				2181	if (shrink == 0)
				2182	val = 0;
				2183	else
				2184	val /= shrink;
				2185
				2186	vcpu->halt_poll_ns = val;
				2187	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
				2188	}
				2189
				2190	static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
				2191	{
				2192	if (kvm_arch_vcpu_runnable(vcpu)) {
				2193	kvm_make_request(KVM_REQ_UNHALT, vcpu);
				2194	return -EINTR;
				2195	}
				2196	if (kvm_cpu_has_pending_timer(vcpu))
				2197	return -EINTR;
				2198	if (signal_pending(current))
				2199	return -EINTR;
				2200
				2201	return 0;
				2202	}
				2203
				2204	/*
				2205	* The vCPU has executed a HLT instruction with in-kernel mode enabled.
				2206	*/
				2207	void kvm_vcpu_block(struct kvm_vcpu *vcpu)
				2208	{
				2209	ktime_t start, cur;
				2210	DECLARE_SWAITQUEUE(wait);
				2211	bool waited = false;
				2212	u64 block_ns;
				2213
				2214	start = cur = ktime_get();
				2215	if (vcpu->halt_poll_ns) {
				2216	ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
				2217
				2218	++vcpu->stat.halt_attempted_poll;
				2219	do {
				2220	/*
				2221	* This sets KVM_REQ_UNHALT if an interrupt
				2222	* arrives.
				2223	*/
				2224	if (kvm_vcpu_check_block(vcpu) < 0) {
				2225	++vcpu->stat.halt_successful_poll;
				2226	if (!vcpu_valid_wakeup(vcpu))
				2227	++vcpu->stat.halt_poll_invalid;
				2228	goto out;
				2229	}
				2230	cur = ktime_get();
				2231	} while (single_task_running() && ktime_before(cur, stop));
				2232	}
				2233
				2234	kvm_arch_vcpu_blocking(vcpu);
				2235
				2236	for (;;) {
				2237	prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
				2238
				2239	if (kvm_vcpu_check_block(vcpu) < 0)
				2240	break;
				2241
				2242	waited = true;
				2243	schedule();
				2244	}
				2245
				2246	finish_swait(&vcpu->wq, &wait);
				2247	cur = ktime_get();
				2248
				2249	kvm_arch_vcpu_unblocking(vcpu);
				2250	out:
				2251	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
				2252
				2253	if (!vcpu_valid_wakeup(vcpu))
				2254	shrink_halt_poll_ns(vcpu);
				2255	else if (halt_poll_ns) {
				2256	if (block_ns <= vcpu->halt_poll_ns)
				2257	;
				2258	/* we had a long block, shrink polling */
				2259	else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
				2260	shrink_halt_poll_ns(vcpu);
				2261	/* we had a short halt and our poll time is too small */
				2262	else if (vcpu->halt_poll_ns < halt_poll_ns &&
				2263	block_ns < halt_poll_ns)
				2264	grow_halt_poll_ns(vcpu);
				2265	} else
				2266	vcpu->halt_poll_ns = 0;
				2267
				2268	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
				2269	kvm_arch_vcpu_block_finish(vcpu);
				2270	}
				2271	EXPORT_SYMBOL_GPL(kvm_vcpu_block);
				2272
				2273	bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
				2274	{
				2275	struct swait_queue_head *wqp;
				2276
				2277	wqp = kvm_arch_vcpu_wq(vcpu);
				2278	if (swq_has_sleeper(wqp)) {
				2279	swake_up(wqp);
				2280	++vcpu->stat.halt_wakeup;
				2281	return true;
				2282	}
				2283
				2284	return false;
				2285	}
				2286	EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
				2287
				2288	#ifndef CONFIG_S390
				2289	/*
				2290	* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
				2291	*/
				2292	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
				2293	{
				2294	int me;
				2295	int cpu = vcpu->cpu;
				2296
				2297	if (kvm_vcpu_wake_up(vcpu))
				2298	return;
				2299
				2300	me = get_cpu();
				2301	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
				2302	if (kvm_arch_vcpu_should_kick(vcpu))
				2303	smp_send_reschedule(cpu);
				2304	put_cpu();
				2305	}
				2306	EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
				2307	#endif /* !CONFIG_S390 */
				2308
				2309	int kvm_vcpu_yield_to(struct kvm_vcpu *target)
				2310	{
				2311	struct pid *pid;
				2312	struct task_struct *task = NULL;
				2313	int ret = 0;
				2314
				2315	rcu_read_lock();
				2316	pid = rcu_dereference(target->pid);
				2317	if (pid)
				2318	task = get_pid_task(pid, PIDTYPE_PID);
				2319	rcu_read_unlock();
				2320	if (!task)
				2321	return ret;
				2322	ret = yield_to(task, 1);
				2323	put_task_struct(task);
				2324
				2325	return ret;
				2326	}
				2327	EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
				2328
				2329	/*
				2330	* Helper that checks whether a VCPU is eligible for directed yield.
				2331	* Most eligible candidate to yield is decided by following heuristics:
				2332	*
				2333	* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
				2334	* (preempted lock holder), indicated by @in_spin_loop.
				2335	* Set at the beiginning and cleared at the end of interception/PLE handler.
				2336	*
				2337	* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
				2338	* chance last time (mostly it has become eligible now since we have probably
				2339	* yielded to lockholder in last iteration. This is done by toggling
				2340	* @dy_eligible each time a VCPU checked for eligibility.)
				2341	*
				2342	* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
				2343	* to preempted lock-holder could result in wrong VCPU selection and CPU
				2344	* burning. Giving priority for a potential lock-holder increases lock
				2345	* progress.
				2346	*
				2347	* Since algorithm is based on heuristics, accessing another VCPU data without
				2348	* locking does not harm. It may result in trying to yield to same VCPU, fail
				2349	* and continue with next VCPU and so on.
				2350	*/
				2351	static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
				2352	{
				2353	#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
				2354	bool eligible;
				2355
				2356	eligible = !vcpu->spin_loop.in_spin_loop \|\|
				2357	vcpu->spin_loop.dy_eligible;
				2358
				2359	if (vcpu->spin_loop.in_spin_loop)
				2360	kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
				2361
				2362	return eligible;
				2363	#else
				2364	return true;
				2365	#endif
				2366	}
				2367
				2368	/*
				2369	* Unlike kvm_arch_vcpu_runnable, this function is called outside
				2370	* a vcpu_load/vcpu_put pair. However, for most architectures
				2371	* kvm_arch_vcpu_runnable does not require vcpu_load.
				2372	*/
				2373	bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
				2374	{
				2375	return kvm_arch_vcpu_runnable(vcpu);
				2376	}
				2377
				2378	static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
				2379	{
				2380	if (kvm_arch_dy_runnable(vcpu))
				2381	return true;
				2382
				2383	#ifdef CONFIG_KVM_ASYNC_PF
				2384	if (!list_empty_careful(&vcpu->async_pf.done))
				2385	return true;
				2386	#endif
				2387
				2388	return false;
				2389	}
				2390
				2391	void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
				2392	{
				2393	struct kvm *kvm = me->kvm;
				2394	struct kvm_vcpu *vcpu;
				2395	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
				2396	int yielded = 0;
				2397	int try = 3;
				2398	int pass;
				2399	int i;
				2400
				2401	kvm_vcpu_set_in_spin_loop(me, true);
				2402	/*
				2403	* We boost the priority of a VCPU that is runnable but not
				2404	* currently running, because it got preempted by something
				2405	* else and called schedule in __vcpu_run. Hopefully that
				2406	* VCPU is holding the lock that we need and will release it.
				2407	* We approximate round-robin by starting at the last boosted VCPU.
				2408	*/
				2409	for (pass = 0; pass < 2 && !yielded && try; pass++) {
				2410	kvm_for_each_vcpu(i, vcpu, kvm) {
				2411	if (!pass && i <= last_boosted_vcpu) {
				2412	i = last_boosted_vcpu;
				2413	continue;
				2414	} else if (pass && i > last_boosted_vcpu)
				2415	break;
				2416	if (!ACCESS_ONCE(vcpu->preempted))
				2417	continue;
				2418	if (vcpu == me)
				2419	continue;
				2420	if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
				2421	continue;
				2422	if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
				2423	continue;
				2424	if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
				2425	continue;
				2426
				2427	yielded = kvm_vcpu_yield_to(vcpu);
				2428	if (yielded > 0) {
				2429	kvm->last_boosted_vcpu = i;
				2430	break;
				2431	} else if (yielded < 0) {
				2432	try--;
				2433	if (!try)
				2434	break;
				2435	}
				2436	}
				2437	}
				2438	kvm_vcpu_set_in_spin_loop(me, false);
				2439
				2440	/* Ensure vcpu is not eligible during next spinloop */
				2441	kvm_vcpu_set_dy_eligible(me, false);
				2442	}
				2443	EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
				2444
				2445	static int kvm_vcpu_fault(struct vm_fault *vmf)
				2446	{
				2447	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
				2448	struct page *page;
				2449
				2450	if (vmf->pgoff == 0)
				2451	page = virt_to_page(vcpu->run);
				2452	#ifdef CONFIG_X86
				2453	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
				2454	page = virt_to_page(vcpu->arch.pio_data);
				2455	#endif
				2456	#ifdef CONFIG_KVM_MMIO
				2457	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
				2458	page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
				2459	#endif
				2460	else
				2461	return kvm_arch_vcpu_fault(vcpu, vmf);
				2462	get_page(page);
				2463	vmf->page = page;
				2464	return 0;
				2465	}
				2466
				2467	static const struct vm_operations_struct kvm_vcpu_vm_ops = {
				2468	.fault = kvm_vcpu_fault,
				2469	};
				2470
				2471	static int kvm_vcpu_mmap(struct file file, struct vm_area_struct vma)
				2472	{
				2473	vma->vm_ops = &kvm_vcpu_vm_ops;
				2474	return 0;
				2475	}
				2476
				2477	static int kvm_vcpu_release(struct inode inode, struct file filp)
				2478	{
				2479	struct kvm_vcpu *vcpu = filp->private_data;
				2480
				2481	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2482	kvm_put_kvm(vcpu->kvm);
				2483	return 0;
				2484	}
				2485
				2486	static struct file_operations kvm_vcpu_fops = {
				2487	.release = kvm_vcpu_release,
				2488	.unlocked_ioctl = kvm_vcpu_ioctl,
				2489	#ifdef CONFIG_KVM_COMPAT
				2490	.compat_ioctl = kvm_vcpu_compat_ioctl,
				2491	#endif
				2492	.mmap = kvm_vcpu_mmap,
				2493	.llseek = noop_llseek,
				2494	};
				2495
				2496	/*
				2497	* Allocates an inode for the vcpu.
				2498	*/
				2499	static int create_vcpu_fd(struct kvm_vcpu *vcpu)
				2500	{
				2501	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR \| O_CLOEXEC);
				2502	}
				2503
				2504	static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
				2505	{
				2506	char dir_name[ITOA_MAX_LEN * 2];
				2507	int ret;
				2508
				2509	if (!kvm_arch_has_vcpu_debugfs())
				2510	return 0;
				2511
				2512	if (!debugfs_initialized())
				2513	return 0;
				2514
				2515	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
				2516	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
				2517	vcpu->kvm->debugfs_dentry);
				2518	if (!vcpu->debugfs_dentry)
				2519	return -ENOMEM;
				2520
				2521	ret = kvm_arch_create_vcpu_debugfs(vcpu);
				2522	if (ret < 0) {
				2523	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2524	return ret;
				2525	}
				2526
				2527	return 0;
				2528	}
				2529
				2530	/*
				2531	* Creates some virtual cpus. Good luck creating more than one.
				2532	*/
				2533	static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
				2534	{
				2535	int r;
				2536	struct kvm_vcpu *vcpu;
				2537
				2538	if (id >= KVM_MAX_VCPU_ID)
				2539	return -EINVAL;
				2540
				2541	mutex_lock(&kvm->lock);
				2542	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
				2543	mutex_unlock(&kvm->lock);
				2544	return -EINVAL;
				2545	}
				2546
				2547	kvm->created_vcpus++;
				2548	mutex_unlock(&kvm->lock);
				2549
				2550	vcpu = kvm_arch_vcpu_create(kvm, id);
				2551	if (IS_ERR(vcpu)) {
				2552	r = PTR_ERR(vcpu);
				2553	goto vcpu_decrement;
				2554	}
				2555
				2556	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
				2557
				2558	r = kvm_arch_vcpu_setup(vcpu);
				2559	if (r)
				2560	goto vcpu_destroy;
				2561
				2562	r = kvm_create_vcpu_debugfs(vcpu);
				2563	if (r)
				2564	goto vcpu_destroy;
				2565
				2566	mutex_lock(&kvm->lock);
				2567	if (kvm_get_vcpu_by_id(kvm, id)) {
				2568	r = -EEXIST;
				2569	goto unlock_vcpu_destroy;
				2570	}
				2571
				2572	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
				2573
				2574	/* Now it's all set up, let userspace reach it */
				2575	kvm_get_kvm(kvm);
				2576	r = create_vcpu_fd(vcpu);
				2577	if (r < 0) {
				2578	kvm_put_kvm(kvm);
				2579	goto unlock_vcpu_destroy;
				2580	}
				2581
				2582	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
				2583
				2584	/*
				2585	* Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
				2586	* before kvm->online_vcpu's incremented value.
				2587	*/
				2588	smp_wmb();
				2589	atomic_inc(&kvm->online_vcpus);
				2590
				2591	mutex_unlock(&kvm->lock);
				2592	kvm_arch_vcpu_postcreate(vcpu);
				2593	return r;
				2594
				2595	unlock_vcpu_destroy:
				2596	mutex_unlock(&kvm->lock);
				2597	debugfs_remove_recursive(vcpu->debugfs_dentry);
				2598	vcpu_destroy:
				2599	kvm_arch_vcpu_destroy(vcpu);
				2600	vcpu_decrement:
				2601	mutex_lock(&kvm->lock);
				2602	kvm->created_vcpus--;
				2603	mutex_unlock(&kvm->lock);
				2604	return r;
				2605	}
				2606
				2607	static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu vcpu, sigset_t sigset)
				2608	{
				2609	if (sigset) {
				2610	sigdelsetmask(sigset, sigmask(SIGKILL)\|sigmask(SIGSTOP));
				2611	vcpu->sigset_active = 1;
				2612	vcpu->sigset = *sigset;
				2613	} else
				2614	vcpu->sigset_active = 0;
				2615	return 0;
				2616	}
				2617
				2618	static long kvm_vcpu_ioctl(struct file *filp,
				2619	unsigned int ioctl, unsigned long arg)
				2620	{
				2621	struct kvm_vcpu *vcpu = filp->private_data;
				2622	void __user argp = (void __user )arg;
				2623	int r;
				2624	struct kvm_fpu *fpu = NULL;
				2625	struct kvm_sregs *kvm_sregs = NULL;
				2626
				2627	if (vcpu->kvm->mm != current->mm)
				2628	return -EIO;
				2629
				2630	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
				2631	return -EINVAL;
				2632
				2633	#if defined(CONFIG_S390) \|\| defined(CONFIG_PPC) \|\| defined(CONFIG_MIPS)
				2634	/*
				2635	* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
				2636	* so vcpu_load() would break it.
				2637	*/
				2638	if (ioctl == KVM_S390_INTERRUPT \|\| ioctl == KVM_S390_IRQ \|\| ioctl == KVM_INTERRUPT)
				2639	return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
				2640	#endif
				2641
				2642
				2643	r = vcpu_load(vcpu);
				2644	if (r)
				2645	return r;
				2646	switch (ioctl) {
				2647	case KVM_RUN: {
				2648	struct pid *oldpid;
				2649	r = -EINVAL;
				2650	if (arg)
				2651	goto out;
				2652	oldpid = rcu_access_pointer(vcpu->pid);
				2653	if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) {
				2654	/* The thread running this VCPU changed. */
				2655	struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
				2656
				2657	rcu_assign_pointer(vcpu->pid, newpid);
				2658	if (oldpid)
				2659	synchronize_rcu();
				2660	put_pid(oldpid);
				2661	}
				2662	r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
				2663	trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
				2664	break;
				2665	}
				2666	case KVM_GET_REGS: {
				2667	struct kvm_regs *kvm_regs;
				2668
				2669	r = -ENOMEM;
				2670	kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
				2671	if (!kvm_regs)
				2672	goto out;
				2673	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
				2674	if (r)
				2675	goto out_free1;
				2676	r = -EFAULT;
				2677	if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
				2678	goto out_free1;
				2679	r = 0;
				2680	out_free1:
				2681	kfree(kvm_regs);
				2682	break;
				2683	}
				2684	case KVM_SET_REGS: {
				2685	struct kvm_regs *kvm_regs;
				2686
				2687	r = -ENOMEM;
				2688	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
				2689	if (IS_ERR(kvm_regs)) {
				2690	r = PTR_ERR(kvm_regs);
				2691	goto out;
				2692	}
				2693	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
				2694	kfree(kvm_regs);
				2695	break;
				2696	}
				2697	case KVM_GET_SREGS: {
				2698	kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
				2699	r = -ENOMEM;
				2700	if (!kvm_sregs)
				2701	goto out;
				2702	r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
				2703	if (r)
				2704	goto out;
				2705	r = -EFAULT;
				2706	if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
				2707	goto out;
				2708	r = 0;
				2709	break;
				2710	}
				2711	case KVM_SET_SREGS: {
				2712	kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
				2713	if (IS_ERR(kvm_sregs)) {
				2714	r = PTR_ERR(kvm_sregs);
				2715	kvm_sregs = NULL;
				2716	goto out;
				2717	}
				2718	r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
				2719	break;
				2720	}
				2721	case KVM_GET_MP_STATE: {
				2722	struct kvm_mp_state mp_state;
				2723
				2724	r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
				2725	if (r)
				2726	goto out;
				2727	r = -EFAULT;
				2728	if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
				2729	goto out;
				2730	r = 0;
				2731	break;
				2732	}
				2733	case KVM_SET_MP_STATE: {
				2734	struct kvm_mp_state mp_state;
				2735
				2736	r = -EFAULT;
				2737	if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
				2738	goto out;
				2739	r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
				2740	break;
				2741	}
				2742	case KVM_TRANSLATE: {
				2743	struct kvm_translation tr;
				2744
				2745	r = -EFAULT;
				2746	if (copy_from_user(&tr, argp, sizeof(tr)))
				2747	goto out;
				2748	r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
				2749	if (r)
				2750	goto out;
				2751	r = -EFAULT;
				2752	if (copy_to_user(argp, &tr, sizeof(tr)))
				2753	goto out;
				2754	r = 0;
				2755	break;
				2756	}
				2757	case KVM_SET_GUEST_DEBUG: {
				2758	struct kvm_guest_debug dbg;
				2759
				2760	r = -EFAULT;
				2761	if (copy_from_user(&dbg, argp, sizeof(dbg)))
				2762	goto out;
				2763	r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
				2764	break;
				2765	}
				2766	case KVM_SET_SIGNAL_MASK: {
				2767	struct kvm_signal_mask __user *sigmask_arg = argp;
				2768	struct kvm_signal_mask kvm_sigmask;
				2769	sigset_t sigset, *p;
				2770
				2771	p = NULL;
				2772	if (argp) {
				2773	r = -EFAULT;
				2774	if (copy_from_user(&kvm_sigmask, argp,
				2775	sizeof(kvm_sigmask)))
				2776	goto out;
				2777	r = -EINVAL;
				2778	if (kvm_sigmask.len != sizeof(sigset))
				2779	goto out;
				2780	r = -EFAULT;
				2781	if (copy_from_user(&sigset, sigmask_arg->sigset,
				2782	sizeof(sigset)))
				2783	goto out;
				2784	p = &sigset;
				2785	}
				2786	r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
				2787	break;
				2788	}
				2789	case KVM_GET_FPU: {
				2790	fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
				2791	r = -ENOMEM;
				2792	if (!fpu)
				2793	goto out;
				2794	r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
				2795	if (r)
				2796	goto out;
				2797	r = -EFAULT;
				2798	if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
				2799	goto out;
				2800	r = 0;
				2801	break;
				2802	}
				2803	case KVM_SET_FPU: {
				2804	fpu = memdup_user(argp, sizeof(*fpu));
				2805	if (IS_ERR(fpu)) {
				2806	r = PTR_ERR(fpu);
				2807	fpu = NULL;
				2808	goto out;
				2809	}
				2810	r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
				2811	break;
				2812	}
				2813	default:
				2814	r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
				2815	}
				2816	out:
				2817	vcpu_put(vcpu);
				2818	kfree(fpu);
				2819	kfree(kvm_sregs);
				2820	return r;
				2821	}
				2822
				2823	#ifdef CONFIG_KVM_COMPAT
				2824	static long kvm_vcpu_compat_ioctl(struct file *filp,
				2825	unsigned int ioctl, unsigned long arg)
				2826	{
				2827	struct kvm_vcpu *vcpu = filp->private_data;
				2828	void __user *argp = compat_ptr(arg);
				2829	int r;
				2830
				2831	if (vcpu->kvm->mm != current->mm)
				2832	return -EIO;
				2833
				2834	switch (ioctl) {
				2835	case KVM_SET_SIGNAL_MASK: {
				2836	struct kvm_signal_mask __user *sigmask_arg = argp;
				2837	struct kvm_signal_mask kvm_sigmask;
				2838	compat_sigset_t csigset;
				2839	sigset_t sigset;
				2840
				2841	if (argp) {
				2842	r = -EFAULT;
				2843	if (copy_from_user(&kvm_sigmask, argp,
				2844	sizeof(kvm_sigmask)))
				2845	goto out;
				2846	r = -EINVAL;
				2847	if (kvm_sigmask.len != sizeof(csigset))
				2848	goto out;
				2849	r = -EFAULT;
				2850	if (copy_from_user(&csigset, sigmask_arg->sigset,
				2851	sizeof(csigset)))
				2852	goto out;
				2853	sigset_from_compat(&sigset, &csigset);
				2854	r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
				2855	} else
				2856	r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
				2857	break;
				2858	}
				2859	default:
				2860	r = kvm_vcpu_ioctl(filp, ioctl, arg);
				2861	}
				2862
				2863	out:
				2864	return r;
				2865	}
				2866	#endif
				2867
				2868	static int kvm_device_ioctl_attr(struct kvm_device *dev,
				2869	int (accessor)(struct kvm_device dev,
				2870	struct kvm_device_attr *attr),
				2871	unsigned long arg)
				2872	{
				2873	struct kvm_device_attr attr;
				2874
				2875	if (!accessor)
				2876	return -EPERM;
				2877
				2878	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
				2879	return -EFAULT;
				2880
				2881	return accessor(dev, &attr);
				2882	}
				2883
				2884	static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
				2885	unsigned long arg)
				2886	{
				2887	struct kvm_device *dev = filp->private_data;
				2888
				2889	if (dev->kvm->mm != current->mm)
				2890	return -EIO;
				2891
				2892	switch (ioctl) {
				2893	case KVM_SET_DEVICE_ATTR:
				2894	return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
				2895	case KVM_GET_DEVICE_ATTR:
				2896	return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
				2897	case KVM_HAS_DEVICE_ATTR:
				2898	return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
				2899	default:
				2900	if (dev->ops->ioctl)
				2901	return dev->ops->ioctl(dev, ioctl, arg);
				2902
				2903	return -ENOTTY;
				2904	}
				2905	}
				2906
				2907	static int kvm_device_release(struct inode inode, struct file filp)
				2908	{
				2909	struct kvm_device *dev = filp->private_data;
				2910	struct kvm *kvm = dev->kvm;
				2911
				2912	kvm_put_kvm(kvm);
				2913	return 0;
				2914	}
				2915
				2916	static const struct file_operations kvm_device_fops = {
				2917	.unlocked_ioctl = kvm_device_ioctl,
				2918	#ifdef CONFIG_KVM_COMPAT
				2919	.compat_ioctl = kvm_device_ioctl,
				2920	#endif
				2921	.release = kvm_device_release,
				2922	};
				2923
				2924	struct kvm_device kvm_device_from_filp(struct file filp)
				2925	{
				2926	if (filp->f_op != &kvm_device_fops)
				2927	return NULL;
				2928
				2929	return filp->private_data;
				2930	}
				2931
				2932	static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
				2933	#ifdef CONFIG_KVM_MPIC
				2934	[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
				2935	[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
				2936	#endif
				2937	};
				2938
				2939	int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
				2940	{
				2941	if (type >= ARRAY_SIZE(kvm_device_ops_table))
				2942	return -ENOSPC;
				2943
				2944	if (kvm_device_ops_table[type] != NULL)
				2945	return -EEXIST;
				2946
				2947	kvm_device_ops_table[type] = ops;
				2948	return 0;
				2949	}
				2950
				2951	void kvm_unregister_device_ops(u32 type)
				2952	{
				2953	if (kvm_device_ops_table[type] != NULL)
				2954	kvm_device_ops_table[type] = NULL;
				2955	}
				2956
				2957	static int kvm_ioctl_create_device(struct kvm *kvm,
				2958	struct kvm_create_device *cd)
				2959	{
				2960	struct kvm_device_ops *ops = NULL;
				2961	struct kvm_device *dev;
				2962	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
				2963	int type;
				2964	int ret;
				2965
				2966	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
				2967	return -ENODEV;
				2968
				2969	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
				2970	ops = kvm_device_ops_table[type];
				2971	if (ops == NULL)
				2972	return -ENODEV;
				2973
				2974	if (test)
				2975	return 0;
				2976
				2977	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
				2978	if (!dev)
				2979	return -ENOMEM;
				2980
				2981	dev->ops = ops;
				2982	dev->kvm = kvm;
				2983
				2984	mutex_lock(&kvm->lock);
				2985	ret = ops->create(dev, type);
				2986	if (ret < 0) {
				2987	mutex_unlock(&kvm->lock);
				2988	kfree(dev);
				2989	return ret;
				2990	}
				2991	list_add(&dev->vm_node, &kvm->devices);
				2992	mutex_unlock(&kvm->lock);
				2993
				2994	if (ops->init)
				2995	ops->init(dev);
				2996
				2997	kvm_get_kvm(kvm);
				2998	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR \| O_CLOEXEC);
				2999	if (ret < 0) {
				3000	kvm_put_kvm(kvm);
				3001	mutex_lock(&kvm->lock);
				3002	list_del(&dev->vm_node);
				3003	mutex_unlock(&kvm->lock);
				3004	ops->destroy(dev);
				3005	return ret;
				3006	}
				3007
				3008	cd->fd = ret;
				3009	return 0;
				3010	}
				3011
				3012	static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
				3013	{
				3014	switch (arg) {
				3015	case KVM_CAP_USER_MEMORY:
				3016	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
				3017	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
				3018	case KVM_CAP_INTERNAL_ERROR_DATA:
				3019	#ifdef CONFIG_HAVE_KVM_MSI
				3020	case KVM_CAP_SIGNAL_MSI:
				3021	#endif
				3022	#ifdef CONFIG_HAVE_KVM_IRQFD
				3023	case KVM_CAP_IRQFD:
				3024	case KVM_CAP_IRQFD_RESAMPLE:
				3025	#endif
				3026	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
				3027	case KVM_CAP_CHECK_EXTENSION_VM:
				3028	return 1;
				3029	#ifdef CONFIG_KVM_MMIO
				3030	case KVM_CAP_COALESCED_MMIO:
				3031	return KVM_COALESCED_MMIO_PAGE_OFFSET;
				3032	#endif
				3033	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3034	case KVM_CAP_IRQ_ROUTING:
				3035	return KVM_MAX_IRQ_ROUTES;
				3036	#endif
				3037	#if KVM_ADDRESS_SPACE_NUM > 1
				3038	case KVM_CAP_MULTI_ADDRESS_SPACE:
				3039	return KVM_ADDRESS_SPACE_NUM;
				3040	#endif
				3041	default:
				3042	break;
				3043	}
				3044	return kvm_vm_ioctl_check_extension(kvm, arg);
				3045	}
				3046
				3047	static long kvm_vm_ioctl(struct file *filp,
				3048	unsigned int ioctl, unsigned long arg)
				3049	{
				3050	struct kvm *kvm = filp->private_data;
				3051	void __user argp = (void __user )arg;
				3052	int r;
				3053
				3054	if (kvm->mm != current->mm)
				3055	return -EIO;
				3056	switch (ioctl) {
				3057	case KVM_CREATE_VCPU:
				3058	r = kvm_vm_ioctl_create_vcpu(kvm, arg);
				3059	break;
				3060	case KVM_SET_USER_MEMORY_REGION: {
				3061	struct kvm_userspace_memory_region kvm_userspace_mem;
				3062
				3063	r = -EFAULT;
				3064	if (copy_from_user(&kvm_userspace_mem, argp,
				3065	sizeof(kvm_userspace_mem)))
				3066	goto out;
				3067
				3068	r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
				3069	break;
				3070	}
				3071	case KVM_GET_DIRTY_LOG: {
				3072	struct kvm_dirty_log log;
				3073
				3074	r = -EFAULT;
				3075	if (copy_from_user(&log, argp, sizeof(log)))
				3076	goto out;
				3077	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3078	break;
				3079	}
				3080	#ifdef CONFIG_KVM_MMIO
				3081	case KVM_REGISTER_COALESCED_MMIO: {
				3082	struct kvm_coalesced_mmio_zone zone;
				3083
				3084	r = -EFAULT;
				3085	if (copy_from_user(&zone, argp, sizeof(zone)))
				3086	goto out;
				3087	r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
				3088	break;
				3089	}
				3090	case KVM_UNREGISTER_COALESCED_MMIO: {
				3091	struct kvm_coalesced_mmio_zone zone;
				3092
				3093	r = -EFAULT;
				3094	if (copy_from_user(&zone, argp, sizeof(zone)))
				3095	goto out;
				3096	r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
				3097	break;
				3098	}
				3099	#endif
				3100	case KVM_IRQFD: {
				3101	struct kvm_irqfd data;
				3102
				3103	r = -EFAULT;
				3104	if (copy_from_user(&data, argp, sizeof(data)))
				3105	goto out;
				3106	r = kvm_irqfd(kvm, &data);
				3107	break;
				3108	}
				3109	case KVM_IOEVENTFD: {
				3110	struct kvm_ioeventfd data;
				3111
				3112	r = -EFAULT;
				3113	if (copy_from_user(&data, argp, sizeof(data)))
				3114	goto out;
				3115	r = kvm_ioeventfd(kvm, &data);
				3116	break;
				3117	}
				3118	#ifdef CONFIG_HAVE_KVM_MSI
				3119	case KVM_SIGNAL_MSI: {
				3120	struct kvm_msi msi;
				3121
				3122	r = -EFAULT;
				3123	if (copy_from_user(&msi, argp, sizeof(msi)))
				3124	goto out;
				3125	r = kvm_send_userspace_msi(kvm, &msi);
				3126	break;
				3127	}
				3128	#endif
				3129	#ifdef __KVM_HAVE_IRQ_LINE
				3130	case KVM_IRQ_LINE_STATUS:
				3131	case KVM_IRQ_LINE: {
				3132	struct kvm_irq_level irq_event;
				3133
				3134	r = -EFAULT;
				3135	if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
				3136	goto out;
				3137
				3138	r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
				3139	ioctl == KVM_IRQ_LINE_STATUS);
				3140	if (r)
				3141	goto out;
				3142
				3143	r = -EFAULT;
				3144	if (ioctl == KVM_IRQ_LINE_STATUS) {
				3145	if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
				3146	goto out;
				3147	}
				3148
				3149	r = 0;
				3150	break;
				3151	}
				3152	#endif
				3153	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
				3154	case KVM_SET_GSI_ROUTING: {
				3155	struct kvm_irq_routing routing;
				3156	struct kvm_irq_routing __user *urouting;
				3157	struct kvm_irq_routing_entry *entries = NULL;
				3158
				3159	r = -EFAULT;
				3160	if (copy_from_user(&routing, argp, sizeof(routing)))
				3161	goto out;
				3162	r = -EINVAL;
				3163	if (!kvm_arch_can_set_irq_routing(kvm))
				3164	goto out;
				3165	if (routing.nr > KVM_MAX_IRQ_ROUTES)
				3166	goto out;
				3167	if (routing.flags)
				3168	goto out;
				3169	if (routing.nr) {
				3170	r = -ENOMEM;
				3171	entries = vmalloc(routing.nr * sizeof(*entries));
				3172	if (!entries)
				3173	goto out;
				3174	r = -EFAULT;
				3175	urouting = argp;
				3176	if (copy_from_user(entries, urouting->entries,
				3177	routing.nr * sizeof(*entries)))
				3178	goto out_free_irq_routing;
				3179	}
				3180	r = kvm_set_irq_routing(kvm, entries, routing.nr,
				3181	routing.flags);
				3182	out_free_irq_routing:
				3183	vfree(entries);
				3184	break;
				3185	}
				3186	#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
				3187	case KVM_CREATE_DEVICE: {
				3188	struct kvm_create_device cd;
				3189
				3190	r = -EFAULT;
				3191	if (copy_from_user(&cd, argp, sizeof(cd)))
				3192	goto out;
				3193
				3194	r = kvm_ioctl_create_device(kvm, &cd);
				3195	if (r)
				3196	goto out;
				3197
				3198	r = -EFAULT;
				3199	if (copy_to_user(argp, &cd, sizeof(cd)))
				3200	goto out;
				3201
				3202	r = 0;
				3203	break;
				3204	}
				3205	case KVM_CHECK_EXTENSION:
				3206	r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
				3207	break;
				3208	default:
				3209	r = kvm_arch_vm_ioctl(filp, ioctl, arg);
				3210	}
				3211	out:
				3212	return r;
				3213	}
				3214
				3215	#ifdef CONFIG_KVM_COMPAT
				3216	struct compat_kvm_dirty_log {
				3217	__u32 slot;
				3218	__u32 padding1;
				3219	union {
				3220	compat_uptr_t dirty_bitmap; /* one bit per page */
				3221	__u64 padding2;
				3222	};
				3223	};
				3224
				3225	static long kvm_vm_compat_ioctl(struct file *filp,
				3226	unsigned int ioctl, unsigned long arg)
				3227	{
				3228	struct kvm *kvm = filp->private_data;
				3229	int r;
				3230
				3231	if (kvm->mm != current->mm)
				3232	return -EIO;
				3233	switch (ioctl) {
				3234	case KVM_GET_DIRTY_LOG: {
				3235	struct compat_kvm_dirty_log compat_log;
				3236	struct kvm_dirty_log log;
				3237
				3238	if (copy_from_user(&compat_log, (void __user *)arg,
				3239	sizeof(compat_log)))
				3240	return -EFAULT;
				3241	log.slot = compat_log.slot;
				3242	log.padding1 = compat_log.padding1;
				3243	log.padding2 = compat_log.padding2;
				3244	log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
				3245
				3246	r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
				3247	break;
				3248	}
				3249	default:
				3250	r = kvm_vm_ioctl(filp, ioctl, arg);
				3251	}
				3252	return r;
				3253	}
				3254	#endif
				3255
				3256	static struct file_operations kvm_vm_fops = {
				3257	.release = kvm_vm_release,
				3258	.unlocked_ioctl = kvm_vm_ioctl,
				3259	#ifdef CONFIG_KVM_COMPAT
				3260	.compat_ioctl = kvm_vm_compat_ioctl,
				3261	#endif
				3262	.llseek = noop_llseek,
				3263	};
				3264
				3265	static int kvm_dev_ioctl_create_vm(unsigned long type)
				3266	{
				3267	int r;
				3268	struct kvm *kvm;
				3269	struct file *file;
				3270
				3271	kvm = kvm_create_vm(type);
				3272	if (IS_ERR(kvm))
				3273	return PTR_ERR(kvm);
				3274	#ifdef CONFIG_KVM_MMIO
				3275	r = kvm_coalesced_mmio_init(kvm);
				3276	if (r < 0) {
				3277	kvm_put_kvm(kvm);
				3278	return r;
				3279	}
				3280	#endif
				3281	r = get_unused_fd_flags(O_CLOEXEC);
				3282	if (r < 0) {
				3283	kvm_put_kvm(kvm);
				3284	return r;
				3285	}
				3286	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
				3287	if (IS_ERR(file)) {
				3288	put_unused_fd(r);
				3289	kvm_put_kvm(kvm);
				3290	return PTR_ERR(file);
				3291	}
				3292
				3293	/*
				3294	* Don't call kvm_put_kvm anymore at this point; file->f_op is
				3295	* already set, with ->release() being kvm_vm_release(). In error
				3296	* cases it will be called by the final fput(file) and will take
				3297	* care of doing kvm_put_kvm(kvm).
				3298	*/
				3299	if (kvm_create_vm_debugfs(kvm, r) < 0) {
				3300	put_unused_fd(r);
				3301	fput(file);
				3302	return -ENOMEM;
				3303	}
				3304	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
				3305
				3306	fd_install(r, file);
				3307	return r;
				3308	}
				3309
				3310	static long kvm_dev_ioctl(struct file *filp,
				3311	unsigned int ioctl, unsigned long arg)
				3312	{
				3313	long r = -EINVAL;
				3314
				3315	switch (ioctl) {
				3316	case KVM_GET_API_VERSION:
				3317	if (arg)
				3318	goto out;
				3319	r = KVM_API_VERSION;
				3320	break;
				3321	case KVM_CREATE_VM:
				3322	r = kvm_dev_ioctl_create_vm(arg);
				3323	break;
				3324	case KVM_CHECK_EXTENSION:
				3325	r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
				3326	break;
				3327	case KVM_GET_VCPU_MMAP_SIZE:
				3328	if (arg)
				3329	goto out;
				3330	r = PAGE_SIZE; /* struct kvm_run */
				3331	#ifdef CONFIG_X86
				3332	r += PAGE_SIZE; /* pio data page */
				3333	#endif
				3334	#ifdef CONFIG_KVM_MMIO
				3335	r += PAGE_SIZE; /* coalesced mmio ring page */
				3336	#endif
				3337	break;
				3338	case KVM_TRACE_ENABLE:
				3339	case KVM_TRACE_PAUSE:
				3340	case KVM_TRACE_DISABLE:
				3341	r = -EOPNOTSUPP;
				3342	break;
				3343	default:
				3344	return kvm_arch_dev_ioctl(filp, ioctl, arg);
				3345	}
				3346	out:
				3347	return r;
				3348	}
				3349
				3350	static struct file_operations kvm_chardev_ops = {
				3351	.unlocked_ioctl = kvm_dev_ioctl,
				3352	.compat_ioctl = kvm_dev_ioctl,
				3353	.llseek = noop_llseek,
				3354	};
				3355
				3356	static struct miscdevice kvm_dev = {
				3357	KVM_MINOR,
				3358	"kvm",
				3359	&kvm_chardev_ops,
				3360	};
				3361
				3362	static void hardware_enable_nolock(void *junk)
				3363	{
				3364	int cpu = raw_smp_processor_id();
				3365	int r;
				3366
				3367	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3368	return;
				3369
				3370	cpumask_set_cpu(cpu, cpus_hardware_enabled);
				3371
				3372	r = kvm_arch_hardware_enable();
				3373
				3374	if (r) {
				3375	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3376	atomic_inc(&hardware_enable_failed);
				3377	pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
				3378	}
				3379	}
				3380
				3381	static int kvm_starting_cpu(unsigned int cpu)
				3382	{
				3383	raw_spin_lock(&kvm_count_lock);
				3384	if (kvm_usage_count)
				3385	hardware_enable_nolock(NULL);
				3386	raw_spin_unlock(&kvm_count_lock);
				3387	return 0;
				3388	}
				3389
				3390	static void hardware_disable_nolock(void *junk)
				3391	{
				3392	int cpu = raw_smp_processor_id();
				3393
				3394	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
				3395	return;
				3396	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
				3397	kvm_arch_hardware_disable();
				3398	}
				3399
				3400	static int kvm_dying_cpu(unsigned int cpu)
				3401	{
				3402	raw_spin_lock(&kvm_count_lock);
				3403	if (kvm_usage_count)
				3404	hardware_disable_nolock(NULL);
				3405	raw_spin_unlock(&kvm_count_lock);
				3406	return 0;
				3407	}
				3408
				3409	static void hardware_disable_all_nolock(void)
				3410	{
				3411	BUG_ON(!kvm_usage_count);
				3412
				3413	kvm_usage_count--;
				3414	if (!kvm_usage_count)
				3415	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3416	}
				3417
				3418	static void hardware_disable_all(void)
				3419	{
				3420	raw_spin_lock(&kvm_count_lock);
				3421	hardware_disable_all_nolock();
				3422	raw_spin_unlock(&kvm_count_lock);
				3423	}
				3424
				3425	static int hardware_enable_all(void)
				3426	{
				3427	int r = 0;
				3428
				3429	raw_spin_lock(&kvm_count_lock);
				3430
				3431	kvm_usage_count++;
				3432	if (kvm_usage_count == 1) {
				3433	atomic_set(&hardware_enable_failed, 0);
				3434	on_each_cpu(hardware_enable_nolock, NULL, 1);
				3435
				3436	if (atomic_read(&hardware_enable_failed)) {
				3437	hardware_disable_all_nolock();
				3438	r = -EBUSY;
				3439	}
				3440	}
				3441
				3442	raw_spin_unlock(&kvm_count_lock);
				3443
				3444	return r;
				3445	}
				3446
				3447	static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
				3448	void *v)
				3449	{
				3450	/*
				3451	* Some (well, at least mine) BIOSes hang on reboot if
				3452	* in vmx root mode.
				3453	*
				3454	* And Intel TXT required VMX off for all cpu when system shutdown.
				3455	*/
				3456	pr_info("kvm: exiting hardware virtualization\n");
				3457	kvm_rebooting = true;
				3458	on_each_cpu(hardware_disable_nolock, NULL, 1);
				3459	return NOTIFY_OK;
				3460	}
				3461
				3462	static struct notifier_block kvm_reboot_notifier = {
				3463	.notifier_call = kvm_reboot,
				3464	.priority = 0,
				3465	};
				3466
				3467	static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
				3468	{
				3469	int i;
				3470
				3471	for (i = 0; i < bus->dev_count; i++) {
				3472	struct kvm_io_device *pos = bus->range[i].dev;
				3473
				3474	kvm_iodevice_destructor(pos);
				3475	}
				3476	kfree(bus);
				3477	}
				3478
				3479	static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
				3480	const struct kvm_io_range *r2)
				3481	{
				3482	gpa_t addr1 = r1->addr;
				3483	gpa_t addr2 = r2->addr;
				3484
				3485	if (addr1 < addr2)
				3486	return -1;
				3487
				3488	/* If r2->len == 0, match the exact address. If r2->len != 0,
				3489	* accept any overlapping write. Any order is acceptable for
				3490	* overlapping ranges, because kvm_io_bus_get_first_dev ensures
				3491	* we process all of them.
				3492	*/
				3493	if (r2->len) {
				3494	addr1 += r1->len;
				3495	addr2 += r2->len;
				3496	}
				3497
				3498	if (addr1 > addr2)
				3499	return 1;
				3500
				3501	return 0;
				3502	}
				3503
				3504	static int kvm_io_bus_sort_cmp(const void p1, const void p2)
				3505	{
				3506	return kvm_io_bus_cmp(p1, p2);
				3507	}
				3508
				3509	static int kvm_io_bus_insert_dev(struct kvm_io_bus bus, struct kvm_io_device dev,
				3510	gpa_t addr, int len)
				3511	{
				3512	bus->range[bus->dev_count++] = (struct kvm_io_range) {
				3513	.addr = addr,
				3514	.len = len,
				3515	.dev = dev,
				3516	};
				3517
				3518	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
				3519	kvm_io_bus_sort_cmp, NULL);
				3520
				3521	return 0;
				3522	}
				3523
				3524	static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
				3525	gpa_t addr, int len)
				3526	{
				3527	struct kvm_io_range *range, key;
				3528	int off;
				3529
				3530	key = (struct kvm_io_range) {
				3531	.addr = addr,
				3532	.len = len,
				3533	};
				3534
				3535	range = bsearch(&key, bus->range, bus->dev_count,
				3536	sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
				3537	if (range == NULL)
				3538	return -ENOENT;
				3539
				3540	off = range - bus->range;
				3541
				3542	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
				3543	off--;
				3544
				3545	return off;
				3546	}
				3547
				3548	static int __kvm_io_bus_write(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				3549	struct kvm_io_range range, const void val)
				3550	{
				3551	int idx;
				3552
				3553	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				3554	if (idx < 0)
				3555	return -EOPNOTSUPP;
				3556
				3557	while (idx < bus->dev_count &&
				3558	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				3559	if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
				3560	range->len, val))
				3561	return idx;
				3562	idx++;
				3563	}
				3564
				3565	return -EOPNOTSUPP;
				3566	}
				3567
				3568	/* kvm_io_bus_write - called under kvm->slots_lock */
				3569	int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				3570	int len, const void *val)
				3571	{
				3572	struct kvm_io_bus *bus;
				3573	struct kvm_io_range range;
				3574	int r;
				3575
				3576	range = (struct kvm_io_range) {
				3577	.addr = addr,
				3578	.len = len,
				3579	};
				3580
				3581	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3582	if (!bus)
				3583	return -ENOMEM;
				3584	r = __kvm_io_bus_write(vcpu, bus, &range, val);
				3585	return r < 0 ? r : 0;
				3586	}
				3587
				3588	/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
				3589	int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
				3590	gpa_t addr, int len, const void *val, long cookie)
				3591	{
				3592	struct kvm_io_bus *bus;
				3593	struct kvm_io_range range;
				3594
				3595	range = (struct kvm_io_range) {
				3596	.addr = addr,
				3597	.len = len,
				3598	};
				3599
				3600	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3601	if (!bus)
				3602	return -ENOMEM;
				3603
				3604	/* First try the device referenced by cookie. */
				3605	if ((cookie >= 0) && (cookie < bus->dev_count) &&
				3606	(kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
				3607	if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
				3608	val))
				3609	return cookie;
				3610
				3611	/*
				3612	* cookie contained garbage; fall back to search and return the
				3613	* correct cookie value.
				3614	*/
				3615	return __kvm_io_bus_write(vcpu, bus, &range, val);
				3616	}
				3617
				3618	static int __kvm_io_bus_read(struct kvm_vcpu vcpu, struct kvm_io_bus bus,
				3619	struct kvm_io_range range, void val)
				3620	{
				3621	int idx;
				3622
				3623	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
				3624	if (idx < 0)
				3625	return -EOPNOTSUPP;
				3626
				3627	while (idx < bus->dev_count &&
				3628	kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
				3629	if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
				3630	range->len, val))
				3631	return idx;
				3632	idx++;
				3633	}
				3634
				3635	return -EOPNOTSUPP;
				3636	}
				3637	EXPORT_SYMBOL_GPL(kvm_io_bus_write);
				3638
				3639	/* kvm_io_bus_read - called under kvm->slots_lock */
				3640	int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
				3641	int len, void *val)
				3642	{
				3643	struct kvm_io_bus *bus;
				3644	struct kvm_io_range range;
				3645	int r;
				3646
				3647	range = (struct kvm_io_range) {
				3648	.addr = addr,
				3649	.len = len,
				3650	};
				3651
				3652	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
				3653	if (!bus)
				3654	return -ENOMEM;
				3655	r = __kvm_io_bus_read(vcpu, bus, &range, val);
				3656	return r < 0 ? r : 0;
				3657	}
				3658
				3659
				3660	/* Caller must hold slots_lock. */
				3661	int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
				3662	int len, struct kvm_io_device *dev)
				3663	{
				3664	struct kvm_io_bus new_bus, bus;
				3665
				3666	bus = kvm_get_bus(kvm, bus_idx);
				3667	if (!bus)
				3668	return -ENOMEM;
				3669
				3670	/* exclude ioeventfd which is limited by maximum fd */
				3671	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
				3672	return -ENOSPC;
				3673
				3674	new_bus = kmalloc(sizeof(bus) + ((bus->dev_count + 1)
				3675	sizeof(struct kvm_io_range)), GFP_KERNEL);
				3676	if (!new_bus)
				3677	return -ENOMEM;
				3678	memcpy(new_bus, bus, sizeof(bus) + (bus->dev_count
				3679	sizeof(struct kvm_io_range)));
				3680	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
				3681	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				3682	synchronize_srcu_expedited(&kvm->srcu);
				3683	kfree(bus);
				3684
				3685	return 0;
				3686	}
				3687
				3688	/* Caller must hold slots_lock. */
				3689	void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
				3690	struct kvm_io_device *dev)
				3691	{
				3692	int i, j;
				3693	struct kvm_io_bus new_bus, bus;
				3694
				3695	bus = kvm_get_bus(kvm, bus_idx);
				3696	if (!bus)
				3697	return;
				3698
				3699	for (i = 0; i < bus->dev_count; i++)
				3700	if (bus->range[i].dev == dev) {
				3701	break;
				3702	}
				3703
				3704	if (i == bus->dev_count)
				3705	return;
				3706
				3707	new_bus = kmalloc(sizeof(bus) + ((bus->dev_count - 1)
				3708	sizeof(struct kvm_io_range)), GFP_KERNEL);
				3709	if (new_bus) {
				3710	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
				3711	new_bus->dev_count--;
				3712	memcpy(new_bus->range + i, bus->range + i + 1,
				3713	(new_bus->dev_count - i) * sizeof(struct kvm_io_range));
				3714	} else {
				3715	pr_err("kvm: failed to shrink bus, removing it completely\n");
				3716	for (j = 0; j < bus->dev_count; j++) {
				3717	if (j == i)
				3718	continue;
				3719	kvm_iodevice_destructor(bus->range[j].dev);
				3720	}
				3721	}
				3722
				3723	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
				3724	synchronize_srcu_expedited(&kvm->srcu);
				3725	kfree(bus);
				3726	return;
				3727	}
				3728
				3729	struct kvm_io_device kvm_io_bus_get_dev(struct kvm kvm, enum kvm_bus bus_idx,
				3730	gpa_t addr)
				3731	{
				3732	struct kvm_io_bus *bus;
				3733	int dev_idx, srcu_idx;
				3734	struct kvm_io_device *iodev = NULL;
				3735
				3736	srcu_idx = srcu_read_lock(&kvm->srcu);
				3737
				3738	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
				3739	if (!bus)
				3740	goto out_unlock;
				3741
				3742	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
				3743	if (dev_idx < 0)
				3744	goto out_unlock;
				3745
				3746	iodev = bus->range[dev_idx].dev;
				3747
				3748	out_unlock:
				3749	srcu_read_unlock(&kvm->srcu, srcu_idx);
				3750
				3751	return iodev;
				3752	}
				3753	EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
				3754
				3755	static int kvm_debugfs_open(struct inode inode, struct file file,
				3756	int (get)(void , u64 ), int (set)(void *, u64),
				3757	const char *fmt)
				3758	{
				3759	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				3760	inode->i_private;
				3761
				3762	/* The debugfs files are a reference to the kvm struct which
				3763	* is still valid when kvm_destroy_vm is called.
				3764	* To avoid the race between open and the removal of the debugfs
				3765	* directory we test against the users count.
				3766	*/
				3767	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
				3768	return -ENOENT;
				3769
				3770	if (simple_attr_open(inode, file, get,
				3771	stat_data->mode & S_IWUGO ? set : NULL,
				3772	fmt)) {
				3773	kvm_put_kvm(stat_data->kvm);
				3774	return -ENOMEM;
				3775	}
				3776
				3777	return 0;
				3778	}
				3779
				3780	static int kvm_debugfs_release(struct inode inode, struct file file)
				3781	{
				3782	struct kvm_stat_data stat_data = (struct kvm_stat_data )
				3783	inode->i_private;
				3784
				3785	simple_attr_release(inode, file);
				3786	kvm_put_kvm(stat_data->kvm);
				3787
				3788	return 0;
				3789	}
				3790
				3791	static int vm_stat_get_per_vm(void data, u64 val)
				3792	{
				3793	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3794
				3795	val = (ulong )((void )stat_data->kvm + stat_data->offset);
				3796
				3797	return 0;
				3798	}
				3799
				3800	static int vm_stat_clear_per_vm(void *data, u64 val)
				3801	{
				3802	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3803
				3804	if (val)
				3805	return -EINVAL;
				3806
				3807	(ulong )((void *)stat_data->kvm + stat_data->offset) = 0;
				3808
				3809	return 0;
				3810	}
				3811
				3812	static int vm_stat_get_per_vm_open(struct inode inode, struct file file)
				3813	{
				3814	__simple_attr_check_format("%llu\n", 0ull);
				3815	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
				3816	vm_stat_clear_per_vm, "%llu\n");
				3817	}
				3818
				3819	static const struct file_operations vm_stat_get_per_vm_fops = {
				3820	.owner = THIS_MODULE,
				3821	.open = vm_stat_get_per_vm_open,
				3822	.release = kvm_debugfs_release,
				3823	.read = simple_attr_read,
				3824	.write = simple_attr_write,
				3825	.llseek = no_llseek,
				3826	};
				3827
				3828	static int vcpu_stat_get_per_vm(void data, u64 val)
				3829	{
				3830	int i;
				3831	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3832	struct kvm_vcpu *vcpu;
				3833
				3834	*val = 0;
				3835
				3836	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				3837	val += (u64 )((void )vcpu + stat_data->offset);
				3838
				3839	return 0;
				3840	}
				3841
				3842	static int vcpu_stat_clear_per_vm(void *data, u64 val)
				3843	{
				3844	int i;
				3845	struct kvm_stat_data stat_data = (struct kvm_stat_data )data;
				3846	struct kvm_vcpu *vcpu;
				3847
				3848	if (val)
				3849	return -EINVAL;
				3850
				3851	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
				3852	(u64 )((void *)vcpu + stat_data->offset) = 0;
				3853
				3854	return 0;
				3855	}
				3856
				3857	static int vcpu_stat_get_per_vm_open(struct inode inode, struct file file)
				3858	{
				3859	__simple_attr_check_format("%llu\n", 0ull);
				3860	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
				3861	vcpu_stat_clear_per_vm, "%llu\n");
				3862	}
				3863
				3864	static const struct file_operations vcpu_stat_get_per_vm_fops = {
				3865	.owner = THIS_MODULE,
				3866	.open = vcpu_stat_get_per_vm_open,
				3867	.release = kvm_debugfs_release,
				3868	.read = simple_attr_read,
				3869	.write = simple_attr_write,
				3870	.llseek = no_llseek,
				3871	};
				3872
				3873	static const struct file_operations *stat_fops_per_vm[] = {
				3874	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
				3875	[KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
				3876	};
				3877
				3878	static int vm_stat_get(void _offset, u64 val)
				3879	{
				3880	unsigned offset = (long)_offset;
				3881	struct kvm *kvm;
				3882	struct kvm_stat_data stat_tmp = {.offset = offset};
				3883	u64 tmp_val;
				3884
				3885	*val = 0;
				3886	mutex_lock(&kvm_lock);
				3887	list_for_each_entry(kvm, &vm_list, vm_list) {
				3888	stat_tmp.kvm = kvm;
				3889	vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				3890	*val += tmp_val;
				3891	}
				3892	mutex_unlock(&kvm_lock);
				3893	return 0;
				3894	}
				3895
				3896	static int vm_stat_clear(void *_offset, u64 val)
				3897	{
				3898	unsigned offset = (long)_offset;
				3899	struct kvm *kvm;
				3900	struct kvm_stat_data stat_tmp = {.offset = offset};
				3901
				3902	if (val)
				3903	return -EINVAL;
				3904
				3905	mutex_lock(&kvm_lock);
				3906	list_for_each_entry(kvm, &vm_list, vm_list) {
				3907	stat_tmp.kvm = kvm;
				3908	vm_stat_clear_per_vm((void *)&stat_tmp, 0);
				3909	}
				3910	mutex_unlock(&kvm_lock);
				3911
				3912	return 0;
				3913	}
				3914
				3915	DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
				3916
				3917	static int vcpu_stat_get(void _offset, u64 val)
				3918	{
				3919	unsigned offset = (long)_offset;
				3920	struct kvm *kvm;
				3921	struct kvm_stat_data stat_tmp = {.offset = offset};
				3922	u64 tmp_val;
				3923
				3924	*val = 0;
				3925	mutex_lock(&kvm_lock);
				3926	list_for_each_entry(kvm, &vm_list, vm_list) {
				3927	stat_tmp.kvm = kvm;
				3928	vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
				3929	*val += tmp_val;
				3930	}
				3931	mutex_unlock(&kvm_lock);
				3932	return 0;
				3933	}
				3934
				3935	static int vcpu_stat_clear(void *_offset, u64 val)
				3936	{
				3937	unsigned offset = (long)_offset;
				3938	struct kvm *kvm;
				3939	struct kvm_stat_data stat_tmp = {.offset = offset};
				3940
				3941	if (val)
				3942	return -EINVAL;
				3943
				3944	mutex_lock(&kvm_lock);
				3945	list_for_each_entry(kvm, &vm_list, vm_list) {
				3946	stat_tmp.kvm = kvm;
				3947	vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
				3948	}
				3949	mutex_unlock(&kvm_lock);
				3950
				3951	return 0;
				3952	}
				3953
				3954	DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
				3955	"%llu\n");
				3956
				3957	static const struct file_operations *stat_fops[] = {
				3958	[KVM_STAT_VCPU] = &vcpu_stat_fops,
				3959	[KVM_STAT_VM] = &vm_stat_fops,
				3960	};
				3961
				3962	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
				3963	{
				3964	struct kobj_uevent_env *env;
				3965	unsigned long long created, active;
				3966
				3967	if (!kvm_dev.this_device \|\| !kvm)
				3968	return;
				3969
				3970	mutex_lock(&kvm_lock);
				3971	if (type == KVM_EVENT_CREATE_VM) {
				3972	kvm_createvm_count++;
				3973	kvm_active_vms++;
				3974	} else if (type == KVM_EVENT_DESTROY_VM) {
				3975	kvm_active_vms--;
				3976	}
				3977	created = kvm_createvm_count;
				3978	active = kvm_active_vms;
				3979	mutex_unlock(&kvm_lock);
				3980
				3981	env = kzalloc(sizeof(*env), GFP_KERNEL);
				3982	if (!env)
				3983	return;
				3984
				3985	add_uevent_var(env, "CREATED=%llu", created);
				3986	add_uevent_var(env, "COUNT=%llu", active);
				3987
				3988	if (type == KVM_EVENT_CREATE_VM) {
				3989	add_uevent_var(env, "EVENT=create");
				3990	kvm->userspace_pid = task_pid_nr(current);
				3991	} else if (type == KVM_EVENT_DESTROY_VM) {
				3992	add_uevent_var(env, "EVENT=destroy");
				3993	}
				3994	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
				3995
				3996	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
				3997	char tmp, p = kmalloc(PATH_MAX, GFP_KERNEL);
				3998
				3999	if (p) {
				4000	tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
				4001	if (!IS_ERR(tmp))
				4002	add_uevent_var(env, "STATS_PATH=%s", tmp);
				4003	kfree(p);
				4004	}
				4005	}
				4006	/* no need for checks, since we are adding at most only 5 keys */
				4007	env->envp[env->envp_idx++] = NULL;
				4008	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
				4009	kfree(env);
				4010	}
				4011
				4012	static int kvm_init_debug(void)
				4013	{
				4014	int r = -EEXIST;
				4015	struct kvm_stats_debugfs_item *p;
				4016
				4017	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
				4018	if (kvm_debugfs_dir == NULL)
				4019	goto out;
				4020
				4021	kvm_debugfs_num_entries = 0;
				4022	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
				4023	int mode = p->mode ? p->mode : 0644;
				4024	if (!debugfs_create_file(p->name, mode, kvm_debugfs_dir,
				4025	(void *)(long)p->offset,
				4026	stat_fops[p->kind]))
				4027	goto out_dir;
				4028	}
				4029
				4030	return 0;
				4031
				4032	out_dir:
				4033	debugfs_remove_recursive(kvm_debugfs_dir);
				4034	out:
				4035	return r;
				4036	}
				4037
				4038	static int kvm_suspend(void)
				4039	{
				4040	if (kvm_usage_count)
				4041	hardware_disable_nolock(NULL);
				4042	return 0;
				4043	}
				4044
				4045	static void kvm_resume(void)
				4046	{
				4047	if (kvm_usage_count) {
				4048	WARN_ON(raw_spin_is_locked(&kvm_count_lock));
				4049	hardware_enable_nolock(NULL);
				4050	}
				4051	}
				4052
				4053	static struct syscore_ops kvm_syscore_ops = {
				4054	.suspend = kvm_suspend,
				4055	.resume = kvm_resume,
				4056	};
				4057
				4058	static inline
				4059	struct kvm_vcpu preempt_notifier_to_vcpu(struct preempt_notifier pn)
				4060	{
				4061	return container_of(pn, struct kvm_vcpu, preempt_notifier);
				4062	}
				4063
				4064	static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
				4065	{
				4066	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4067
				4068	if (vcpu->preempted)
				4069	vcpu->preempted = false;
				4070
				4071	kvm_arch_sched_in(vcpu, cpu);
				4072
				4073	kvm_arch_vcpu_load(vcpu, cpu);
				4074	}
				4075
				4076	static void kvm_sched_out(struct preempt_notifier *pn,
				4077	struct task_struct *next)
				4078	{
				4079	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
				4080
				4081	if (current->state == TASK_RUNNING)
				4082	vcpu->preempted = true;
				4083	kvm_arch_vcpu_put(vcpu);
				4084	}
				4085
				4086	int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
				4087	struct module *module)
				4088	{
				4089	int r;
				4090	int cpu;
				4091
				4092	r = kvm_arch_init(opaque);
				4093	if (r)
				4094	goto out_fail;
				4095
				4096	/*
				4097	* kvm_arch_init makes sure there's at most one caller
				4098	* for architectures that support multiple implementations,
				4099	* like intel and amd on x86.
				4100	* kvm_arch_init must be called before kvm_irqfd_init to avoid creating
				4101	* conflicts in case kvm is already setup for another implementation.
				4102	*/
				4103	r = kvm_irqfd_init();
				4104	if (r)
				4105	goto out_irqfd;
				4106
				4107	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
				4108	r = -ENOMEM;
				4109	goto out_free_0;
				4110	}
				4111
				4112	r = kvm_arch_hardware_setup();
				4113	if (r < 0)
				4114	goto out_free_0a;
				4115
				4116	for_each_online_cpu(cpu) {
				4117	smp_call_function_single(cpu,
				4118	kvm_arch_check_processor_compat,
				4119	&r, 1);
				4120	if (r < 0)
				4121	goto out_free_1;
				4122	}
				4123
				4124	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				4125	kvm_starting_cpu, kvm_dying_cpu);
				4126	if (r)
				4127	goto out_free_2;
				4128	register_reboot_notifier(&kvm_reboot_notifier);
				4129
				4130	/* A kmem cache lets us meet the alignment requirements of fx_save. */
				4131	if (!vcpu_align)
				4132	vcpu_align = __alignof__(struct kvm_vcpu);
				4133	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
				4134	SLAB_ACCOUNT, NULL);
				4135	if (!kvm_vcpu_cache) {
				4136	r = -ENOMEM;
				4137	goto out_free_3;
				4138	}
				4139
				4140	r = kvm_async_pf_init();
				4141	if (r)
				4142	goto out_free;
				4143
				4144	kvm_chardev_ops.owner = module;
				4145	kvm_vm_fops.owner = module;
				4146	kvm_vcpu_fops.owner = module;
				4147
				4148	r = misc_register(&kvm_dev);
				4149	if (r) {
				4150	pr_err("kvm: misc device register failed\n");
				4151	goto out_unreg;
				4152	}
				4153
				4154	register_syscore_ops(&kvm_syscore_ops);
				4155
				4156	kvm_preempt_ops.sched_in = kvm_sched_in;
				4157	kvm_preempt_ops.sched_out = kvm_sched_out;
				4158
				4159	r = kvm_init_debug();
				4160	if (r) {
				4161	pr_err("kvm: create debugfs files failed\n");
				4162	goto out_undebugfs;
				4163	}
				4164
				4165	r = kvm_vfio_ops_init();
				4166	WARN_ON(r);
				4167
				4168	return 0;
				4169
				4170	out_undebugfs:
				4171	unregister_syscore_ops(&kvm_syscore_ops);
				4172	misc_deregister(&kvm_dev);
				4173	out_unreg:
				4174	kvm_async_pf_deinit();
				4175	out_free:
				4176	kmem_cache_destroy(kvm_vcpu_cache);
				4177	out_free_3:
				4178	unregister_reboot_notifier(&kvm_reboot_notifier);
				4179	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4180	out_free_2:
				4181	out_free_1:
				4182	kvm_arch_hardware_unsetup();
				4183	out_free_0a:
				4184	free_cpumask_var(cpus_hardware_enabled);
				4185	out_free_0:
				4186	kvm_irqfd_exit();
				4187	out_irqfd:
				4188	kvm_arch_exit();
				4189	out_fail:
				4190	return r;
				4191	}
				4192	EXPORT_SYMBOL_GPL(kvm_init);
				4193
				4194	void kvm_exit(void)
				4195	{
				4196	debugfs_remove_recursive(kvm_debugfs_dir);
				4197	misc_deregister(&kvm_dev);
				4198	kmem_cache_destroy(kvm_vcpu_cache);
				4199	kvm_async_pf_deinit();
				4200	unregister_syscore_ops(&kvm_syscore_ops);
				4201	unregister_reboot_notifier(&kvm_reboot_notifier);
				4202	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
				4203	on_each_cpu(hardware_disable_nolock, NULL, 1);
				4204	kvm_arch_hardware_unsetup();
				4205	kvm_arch_exit();
				4206	kvm_irqfd_exit();
				4207	free_cpumask_var(cpus_hardware_enabled);
				4208	kvm_vfio_ops_exit();
				4209	}
				4210	EXPORT_SYMBOL_GPL(kvm_exit);
				4211
				4212	struct kvm_vm_worker_thread_context {
				4213	struct kvm *kvm;
				4214	struct task_struct *parent;
				4215	struct completion init_done;
				4216	kvm_vm_thread_fn_t thread_fn;
				4217	uintptr_t data;
				4218	int err;
				4219	};
				4220
				4221	static int kvm_vm_worker_thread(void *context)
				4222	{
				4223	/*
				4224	* The init_context is allocated on the stack of the parent thread, so
				4225	* we have to locally copy anything that is needed beyond initialization
				4226	*/
				4227	struct kvm_vm_worker_thread_context *init_context = context;
				4228	struct kvm *kvm = init_context->kvm;
				4229	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
				4230	uintptr_t data = init_context->data;
				4231	int err;
				4232
				4233	err = kthread_park(current);
				4234	/* kthread_park(current) is never supposed to return an error */
				4235	WARN_ON(err != 0);
				4236	if (err)
				4237	goto init_complete;
				4238
				4239	err = cgroup_attach_task_all(init_context->parent, current);
				4240	if (err) {
				4241	kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
				4242	__func__, err);
				4243	goto init_complete;
				4244	}
				4245
				4246	set_user_nice(current, task_nice(init_context->parent));
				4247
				4248	init_complete:
				4249	init_context->err = err;
				4250	complete(&init_context->init_done);
				4251	init_context = NULL;
				4252
				4253	if (err)
				4254	return err;
				4255
				4256	/* Wait to be woken up by the spawner before proceeding. */
				4257	kthread_parkme();
				4258
				4259	if (!kthread_should_stop())
				4260	err = thread_fn(kvm, data);
				4261
				4262	return err;
				4263	}
				4264
				4265	int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				4266	uintptr_t data, const char *name,
				4267	struct task_struct **thread_ptr)
				4268	{
				4269	struct kvm_vm_worker_thread_context init_context = {};
				4270	struct task_struct *thread;
				4271
				4272	*thread_ptr = NULL;
				4273	init_context.kvm = kvm;
				4274	init_context.parent = current;
				4275	init_context.thread_fn = thread_fn;
				4276	init_context.data = data;
				4277	init_completion(&init_context.init_done);
				4278
				4279	thread = kthread_run(kvm_vm_worker_thread, &init_context,
				4280	"%s-%d", name, task_pid_nr(current));
				4281	if (IS_ERR(thread))
				4282	return PTR_ERR(thread);
				4283
				4284	/* kthread_run is never supposed to return NULL */
				4285	WARN_ON(thread == NULL);
				4286
				4287	wait_for_completion(&init_context.init_done);
				4288
				4289	if (!init_context.err)
				4290	*thread_ptr = thread;
				4291
				4292	return init_context.err;
				4293	}