Blame - src/kernel/linux/v4.14/arch/x86/kvm/x86.c - T103

blob: 79fa55de635cc9968de380ab71fb13bbce18d570 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* derived from drivers/kvm/kvm_main.c
				5	*
				6	* Copyright (C) 2006 Qumranet, Inc.
				7	* Copyright (C) 2008 Qumranet, Inc.
				8	* Copyright IBM Corporation, 2008
				9	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				10	*
				11	* Authors:
				12	* Avi Kivity <avi@qumranet.com>
				13	* Yaniv Kamay <yaniv@qumranet.com>
				14	* Amit Shah <amit.shah@qumranet.com>
				15	* Ben-Ami Yassour <benami@il.ibm.com>
				16	*
				17	* This work is licensed under the terms of the GNU GPL, version 2. See
				18	* the COPYING file in the top-level directory.
				19	*
				20	*/
				21
				22	#include <linux/kvm_host.h>
				23	#include "irq.h"
				24	#include "mmu.h"
				25	#include "i8254.h"
				26	#include "tss.h"
				27	#include "kvm_cache_regs.h"
				28	#include "x86.h"
				29	#include "cpuid.h"
				30	#include "pmu.h"
				31	#include "hyperv.h"
				32
				33	#include <linux/clocksource.h>
				34	#include <linux/interrupt.h>
				35	#include <linux/kvm.h>
				36	#include <linux/fs.h>
				37	#include <linux/vmalloc.h>
				38	#include <linux/export.h>
				39	#include <linux/moduleparam.h>
				40	#include <linux/mman.h>
				41	#include <linux/highmem.h>
				42	#include <linux/iommu.h>
				43	#include <linux/intel-iommu.h>
				44	#include <linux/cpufreq.h>
				45	#include <linux/user-return-notifier.h>
				46	#include <linux/srcu.h>
				47	#include <linux/slab.h>
				48	#include <linux/perf_event.h>
				49	#include <linux/uaccess.h>
				50	#include <linux/hash.h>
				51	#include <linux/pci.h>
				52	#include <linux/timekeeper_internal.h>
				53	#include <linux/pvclock_gtod.h>
				54	#include <linux/kvm_irqfd.h>
				55	#include <linux/irqbypass.h>
				56	#include <linux/sched/stat.h>
				57	#include <linux/mem_encrypt.h>
				58
				59	#include <trace/events/kvm.h>
				60
				61	#include <asm/debugreg.h>
				62	#include <asm/msr.h>
				63	#include <asm/desc.h>
				64	#include <asm/mce.h>
				65	#include <linux/kernel_stat.h>
				66	#include <asm/fpu/internal.h> /* Ugh! */
				67	#include <asm/pvclock.h>
				68	#include <asm/div64.h>
				69	#include <asm/irq_remapping.h>
				70
				71	#define CREATE_TRACE_POINTS
				72	#include "trace.h"
				73
				74	#define MAX_IO_MSRS 256
				75	#define KVM_MAX_MCE_BANKS 32
				76	u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P \| MCG_SER_P;
				77	EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
				78
				79	#define emul_to_vcpu(ctxt) \
				80	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
				81
				82	/* EFER defaults:
				83	* - enable syscall per default because its emulated by KVM
				84	* - enable LME and LMA per default on 64 bit KVM
				85	*/
				86	#ifdef CONFIG_X86_64
				87	static
				88	u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE \| EFER_LME \| EFER_LMA));
				89	#else
				90	static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
				91	#endif
				92
				93	#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
				94	#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
				95
				96	#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS \| \
				97	KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
				98
				99	static void update_cr8_intercept(struct kvm_vcpu *vcpu);
				100	static void process_nmi(struct kvm_vcpu *vcpu);
				101	static void enter_smm(struct kvm_vcpu *vcpu);
				102	static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
				103
				104	struct kvm_x86_ops *kvm_x86_ops __read_mostly;
				105	EXPORT_SYMBOL_GPL(kvm_x86_ops);
				106
				107	static bool __read_mostly ignore_msrs = 0;
				108	module_param(ignore_msrs, bool, S_IRUGO \| S_IWUSR);
				109
				110	unsigned int min_timer_period_us = 500;
				111	module_param(min_timer_period_us, uint, S_IRUGO \| S_IWUSR);
				112
				113	static bool __read_mostly kvmclock_periodic_sync = true;
				114	module_param(kvmclock_periodic_sync, bool, S_IRUGO);
				115
				116	bool __read_mostly kvm_has_tsc_control;
				117	EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
				118	u32 __read_mostly kvm_max_guest_tsc_khz;
				119	EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
				120	u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
				121	EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
				122	u64 __read_mostly kvm_max_tsc_scaling_ratio;
				123	EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
				124	u64 __read_mostly kvm_default_tsc_scaling_ratio;
				125	EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
				126
				127	/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
				128	static u32 __read_mostly tsc_tolerance_ppm = 250;
				129	module_param(tsc_tolerance_ppm, uint, S_IRUGO \| S_IWUSR);
				130
				131	/* lapic timer advance (tscdeadline mode only) in nanoseconds */
				132	unsigned int __read_mostly lapic_timer_advance_ns = 0;
				133	module_param(lapic_timer_advance_ns, uint, S_IRUGO \| S_IWUSR);
				134
				135	static bool __read_mostly vector_hashing = true;
				136	module_param(vector_hashing, bool, S_IRUGO);
				137
				138	#define KVM_NR_SHARED_MSRS 16
				139
				140	struct kvm_shared_msrs_global {
				141	int nr;
				142	u32 msrs[KVM_NR_SHARED_MSRS];
				143	};
				144
				145	struct kvm_shared_msrs {
				146	struct user_return_notifier urn;
				147	bool registered;
				148	struct kvm_shared_msr_values {
				149	u64 host;
				150	u64 curr;
				151	} values[KVM_NR_SHARED_MSRS];
				152	};
				153
				154	static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
				155	static struct kvm_shared_msrs __percpu *shared_msrs;
				156
				157	struct kvm_stats_debugfs_item debugfs_entries[] = {
				158	{ "pf_fixed", VCPU_STAT(pf_fixed) },
				159	{ "pf_guest", VCPU_STAT(pf_guest) },
				160	{ "tlb_flush", VCPU_STAT(tlb_flush) },
				161	{ "invlpg", VCPU_STAT(invlpg) },
				162	{ "exits", VCPU_STAT(exits) },
				163	{ "io_exits", VCPU_STAT(io_exits) },
				164	{ "mmio_exits", VCPU_STAT(mmio_exits) },
				165	{ "signal_exits", VCPU_STAT(signal_exits) },
				166	{ "irq_window", VCPU_STAT(irq_window_exits) },
				167	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
				168	{ "halt_exits", VCPU_STAT(halt_exits) },
				169	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
				170	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
				171	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
				172	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
				173	{ "hypercalls", VCPU_STAT(hypercalls) },
				174	{ "request_irq", VCPU_STAT(request_irq_exits) },
				175	{ "irq_exits", VCPU_STAT(irq_exits) },
				176	{ "host_state_reload", VCPU_STAT(host_state_reload) },
				177	{ "efer_reload", VCPU_STAT(efer_reload) },
				178	{ "fpu_reload", VCPU_STAT(fpu_reload) },
				179	{ "insn_emulation", VCPU_STAT(insn_emulation) },
				180	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
				181	{ "irq_injections", VCPU_STAT(irq_injections) },
				182	{ "nmi_injections", VCPU_STAT(nmi_injections) },
				183	{ "req_event", VCPU_STAT(req_event) },
				184	{ "l1d_flush", VCPU_STAT(l1d_flush) },
				185	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
				186	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
				187	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
				188	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
				189	{ "mmu_flooded", VM_STAT(mmu_flooded) },
				190	{ "mmu_recycled", VM_STAT(mmu_recycled) },
				191	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
				192	{ "mmu_unsync", VM_STAT(mmu_unsync) },
				193	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
				194	{ "largepages", VM_STAT(lpages, .mode = 0444) },
				195	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
				196	{ "max_mmu_page_hash_collisions",
				197	VM_STAT(max_mmu_page_hash_collisions) },
				198	{ NULL }
				199	};
				200
				201	u64 __read_mostly host_xcr0;
				202
				203	static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
				204
				205	static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
				206	{
				207	int i;
				208	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
				209	vcpu->arch.apf.gfns[i] = ~0;
				210	}
				211
				212	static void kvm_on_user_return(struct user_return_notifier *urn)
				213	{
				214	unsigned slot;
				215	struct kvm_shared_msrs *locals
				216	= container_of(urn, struct kvm_shared_msrs, urn);
				217	struct kvm_shared_msr_values *values;
				218	unsigned long flags;
				219
				220	/*
				221	* Disabling irqs at this point since the following code could be
				222	* interrupted and executed through kvm_arch_hardware_disable()
				223	*/
				224	local_irq_save(flags);
				225	if (locals->registered) {
				226	locals->registered = false;
				227	user_return_notifier_unregister(urn);
				228	}
				229	local_irq_restore(flags);
				230	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
				231	values = &locals->values[slot];
				232	if (values->host != values->curr) {
				233	wrmsrl(shared_msrs_global.msrs[slot], values->host);
				234	values->curr = values->host;
				235	}
				236	}
				237	}
				238
				239	static void shared_msr_update(unsigned slot, u32 msr)
				240	{
				241	u64 value;
				242	unsigned int cpu = smp_processor_id();
				243	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				244
				245	/* only read, and nobody should modify it at this time,
				246	* so don't need lock */
				247	if (slot >= shared_msrs_global.nr) {
				248	printk(KERN_ERR "kvm: invalid MSR slot!");
				249	return;
				250	}
				251	rdmsrl_safe(msr, &value);
				252	smsr->values[slot].host = value;
				253	smsr->values[slot].curr = value;
				254	}
				255
				256	void kvm_define_shared_msr(unsigned slot, u32 msr)
				257	{
				258	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
				259	shared_msrs_global.msrs[slot] = msr;
				260	if (slot >= shared_msrs_global.nr)
				261	shared_msrs_global.nr = slot + 1;
				262	}
				263	EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
				264
				265	static void kvm_shared_msr_cpu_online(void)
				266	{
				267	unsigned i;
				268
				269	for (i = 0; i < shared_msrs_global.nr; ++i)
				270	shared_msr_update(i, shared_msrs_global.msrs[i]);
				271	}
				272
				273	int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
				274	{
				275	unsigned int cpu = smp_processor_id();
				276	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				277	int err;
				278
				279	value = (value & mask) \| (smsr->values[slot].host & ~mask);
				280	if (value == smsr->values[slot].curr)
				281	return 0;
				282	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
				283	if (err)
				284	return 1;
				285
				286	smsr->values[slot].curr = value;
				287	if (!smsr->registered) {
				288	smsr->urn.on_user_return = kvm_on_user_return;
				289	user_return_notifier_register(&smsr->urn);
				290	smsr->registered = true;
				291	}
				292	return 0;
				293	}
				294	EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
				295
				296	static void drop_user_return_notifiers(void)
				297	{
				298	unsigned int cpu = smp_processor_id();
				299	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				300
				301	if (smsr->registered)
				302	kvm_on_user_return(&smsr->urn);
				303	}
				304
				305	u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
				306	{
				307	return vcpu->arch.apic_base;
				308	}
				309	EXPORT_SYMBOL_GPL(kvm_get_apic_base);
				310
				311	enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
				312	{
				313	return kvm_apic_mode(kvm_get_apic_base(vcpu));
				314	}
				315	EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
				316
				317	int kvm_set_apic_base(struct kvm_vcpu vcpu, struct msr_data msr_info)
				318	{
				319	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
				320	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
				321	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) \| 0x2ff \|
				322	(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
				323
				324	if ((msr_info->data & reserved_bits) != 0 \|\| new_mode == LAPIC_MODE_INVALID)
				325	return 1;
				326	if (!msr_info->host_initiated) {
				327	if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
				328	return 1;
				329	if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
				330	return 1;
				331	}
				332
				333	kvm_lapic_set_base(vcpu, msr_info->data);
				334	return 0;
				335	}
				336	EXPORT_SYMBOL_GPL(kvm_set_apic_base);
				337
				338	asmlinkage __visible void kvm_spurious_fault(void)
				339	{
				340	/* Fault while not rebooting. We want the trace. */
				341	BUG();
				342	}
				343	EXPORT_SYMBOL_GPL(kvm_spurious_fault);
				344
				345	#define EXCPT_BENIGN 0
				346	#define EXCPT_CONTRIBUTORY 1
				347	#define EXCPT_PF 2
				348
				349	static int exception_class(int vector)
				350	{
				351	switch (vector) {
				352	case PF_VECTOR:
				353	return EXCPT_PF;
				354	case DE_VECTOR:
				355	case TS_VECTOR:
				356	case NP_VECTOR:
				357	case SS_VECTOR:
				358	case GP_VECTOR:
				359	return EXCPT_CONTRIBUTORY;
				360	default:
				361	break;
				362	}
				363	return EXCPT_BENIGN;
				364	}
				365
				366	#define EXCPT_FAULT 0
				367	#define EXCPT_TRAP 1
				368	#define EXCPT_ABORT 2
				369	#define EXCPT_INTERRUPT 3
				370
				371	static int exception_type(int vector)
				372	{
				373	unsigned int mask;
				374
				375	if (WARN_ON(vector > 31 \|\| vector == NMI_VECTOR))
				376	return EXCPT_INTERRUPT;
				377
				378	mask = 1 << vector;
				379
				380	/* #DB is trap, as instruction watchpoints are handled elsewhere */
				381	if (mask & ((1 << DB_VECTOR) \| (1 << BP_VECTOR) \| (1 << OF_VECTOR)))
				382	return EXCPT_TRAP;
				383
				384	if (mask & ((1 << DF_VECTOR) \| (1 << MC_VECTOR)))
				385	return EXCPT_ABORT;
				386
				387	/* Reserved exceptions will result in fault */
				388	return EXCPT_FAULT;
				389	}
				390
				391	static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
				392	unsigned nr, bool has_error, u32 error_code,
				393	bool reinject)
				394	{
				395	u32 prev_nr;
				396	int class1, class2;
				397
				398	kvm_make_request(KVM_REQ_EVENT, vcpu);
				399
				400	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
				401	queue:
				402	if (has_error && !is_protmode(vcpu))
				403	has_error = false;
				404	if (reinject) {
				405	/*
				406	* On vmentry, vcpu->arch.exception.pending is only
				407	* true if an event injection was blocked by
				408	* nested_run_pending. In that case, however,
				409	* vcpu_enter_guest requests an immediate exit,
				410	* and the guest shouldn't proceed far enough to
				411	* need reinjection.
				412	*/
				413	WARN_ON_ONCE(vcpu->arch.exception.pending);
				414	vcpu->arch.exception.injected = true;
				415	} else {
				416	vcpu->arch.exception.pending = true;
				417	vcpu->arch.exception.injected = false;
				418	}
				419	vcpu->arch.exception.has_error_code = has_error;
				420	vcpu->arch.exception.nr = nr;
				421	vcpu->arch.exception.error_code = error_code;
				422	return;
				423	}
				424
				425	/* to check exception */
				426	prev_nr = vcpu->arch.exception.nr;
				427	if (prev_nr == DF_VECTOR) {
				428	/* triple fault -> shutdown */
				429	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				430	return;
				431	}
				432	class1 = exception_class(prev_nr);
				433	class2 = exception_class(nr);
				434	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
				435	\|\| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
				436	/*
				437	* Generate double fault per SDM Table 5-5. Set
				438	* exception.pending = true so that the double fault
				439	* can trigger a nested vmexit.
				440	*/
				441	vcpu->arch.exception.pending = true;
				442	vcpu->arch.exception.injected = false;
				443	vcpu->arch.exception.has_error_code = true;
				444	vcpu->arch.exception.nr = DF_VECTOR;
				445	vcpu->arch.exception.error_code = 0;
				446	} else
				447	/* replace previous exception with a new one in a hope
				448	that instruction re-execution will regenerate lost
				449	exception */
				450	goto queue;
				451	}
				452
				453	void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
				454	{
				455	kvm_multiple_exception(vcpu, nr, false, 0, false);
				456	}
				457	EXPORT_SYMBOL_GPL(kvm_queue_exception);
				458
				459	void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
				460	{
				461	kvm_multiple_exception(vcpu, nr, false, 0, true);
				462	}
				463	EXPORT_SYMBOL_GPL(kvm_requeue_exception);
				464
				465	int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
				466	{
				467	if (err)
				468	kvm_inject_gp(vcpu, 0);
				469	else
				470	return kvm_skip_emulated_instruction(vcpu);
				471
				472	return 1;
				473	}
				474	EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
				475
				476	void kvm_inject_page_fault(struct kvm_vcpu vcpu, struct x86_exception fault)
				477	{
				478	++vcpu->stat.pf_guest;
				479	vcpu->arch.exception.nested_apf =
				480	is_guest_mode(vcpu) && fault->async_page_fault;
				481	if (vcpu->arch.exception.nested_apf)
				482	vcpu->arch.apf.nested_apf_token = fault->address;
				483	else
				484	vcpu->arch.cr2 = fault->address;
				485	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
				486	}
				487	EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
				488
				489	static bool kvm_propagate_fault(struct kvm_vcpu vcpu, struct x86_exception fault)
				490	{
				491	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
				492	vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
				493	else
				494	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
				495
				496	return fault->nested_page_fault;
				497	}
				498
				499	void kvm_inject_nmi(struct kvm_vcpu *vcpu)
				500	{
				501	atomic_inc(&vcpu->arch.nmi_queued);
				502	kvm_make_request(KVM_REQ_NMI, vcpu);
				503	}
				504	EXPORT_SYMBOL_GPL(kvm_inject_nmi);
				505
				506	void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
				507	{
				508	kvm_multiple_exception(vcpu, nr, true, error_code, false);
				509	}
				510	EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
				511
				512	void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
				513	{
				514	kvm_multiple_exception(vcpu, nr, true, error_code, true);
				515	}
				516	EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
				517
				518	/*
				519	* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
				520	* a #GP and return false.
				521	*/
				522	bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
				523	{
				524	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
				525	return true;
				526	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
				527	return false;
				528	}
				529	EXPORT_SYMBOL_GPL(kvm_require_cpl);
				530
				531	bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
				532	{
				533	if ((dr != 4 && dr != 5) \|\| !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
				534	return true;
				535
				536	kvm_queue_exception(vcpu, UD_VECTOR);
				537	return false;
				538	}
				539	EXPORT_SYMBOL_GPL(kvm_require_dr);
				540
				541	/*
				542	* This function will be used to read from the physical memory of the currently
				543	* running guest. The difference to kvm_vcpu_read_guest_page is that this function
				544	* can read from guest physical or from the guest's guest physical memory.
				545	*/
				546	int kvm_read_guest_page_mmu(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				547	gfn_t ngfn, void *data, int offset, int len,
				548	u32 access)
				549	{
				550	struct x86_exception exception;
				551	gfn_t real_gfn;
				552	gpa_t ngpa;
				553
				554	ngpa = gfn_to_gpa(ngfn);
				555	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
				556	if (real_gfn == UNMAPPED_GVA)
				557	return -EFAULT;
				558
				559	real_gfn = gpa_to_gfn(real_gfn);
				560
				561	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
				562	}
				563	EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
				564
				565	static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				566	void *data, int offset, int len, u32 access)
				567	{
				568	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				569	data, offset, len, access);
				570	}
				571
				572	static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
				573	{
				574	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) \| rsvd_bits(5, 8) \|
				575	rsvd_bits(1, 2);
				576	}
				577
				578	/*
				579	* Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
				580	*/
				581	int load_pdptrs(struct kvm_vcpu vcpu, struct kvm_mmu mmu, unsigned long cr3)
				582	{
				583	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
				584	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
				585	int i;
				586	int ret;
				587	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
				588
				589	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				590	offset * sizeof(u64), sizeof(pdpte),
				591	PFERR_USER_MASK\|PFERR_WRITE_MASK);
				592	if (ret < 0) {
				593	ret = 0;
				594	goto out;
				595	}
				596	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
				597	if ((pdpte[i] & PT_PRESENT_MASK) &&
				598	(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
				599	ret = 0;
				600	goto out;
				601	}
				602	}
				603	ret = 1;
				604
				605	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
				606	__set_bit(VCPU_EXREG_PDPTR,
				607	(unsigned long *)&vcpu->arch.regs_avail);
				608	__set_bit(VCPU_EXREG_PDPTR,
				609	(unsigned long *)&vcpu->arch.regs_dirty);
				610	out:
				611
				612	return ret;
				613	}
				614	EXPORT_SYMBOL_GPL(load_pdptrs);
				615
				616	bool pdptrs_changed(struct kvm_vcpu *vcpu)
				617	{
				618	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
				619	bool changed = true;
				620	int offset;
				621	gfn_t gfn;
				622	int r;
				623
				624	if (!is_pae_paging(vcpu))
				625	return false;
				626
				627	if (!test_bit(VCPU_EXREG_PDPTR,
				628	(unsigned long *)&vcpu->arch.regs_avail))
				629	return true;
				630
				631	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
				632	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
				633	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				634	PFERR_USER_MASK \| PFERR_WRITE_MASK);
				635	if (r < 0)
				636	goto out;
				637	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
				638	out:
				639
				640	return changed;
				641	}
				642	EXPORT_SYMBOL_GPL(pdptrs_changed);
				643
				644	int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
				645	{
				646	unsigned long old_cr0 = kvm_read_cr0(vcpu);
				647	unsigned long update_bits = X86_CR0_PG \| X86_CR0_WP;
				648
				649	cr0 \|= X86_CR0_ET;
				650
				651	#ifdef CONFIG_X86_64
				652	if (cr0 & 0xffffffff00000000UL)
				653	return 1;
				654	#endif
				655
				656	cr0 &= ~CR0_RESERVED_BITS;
				657
				658	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
				659	return 1;
				660
				661	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
				662	return 1;
				663
				664	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
				665	#ifdef CONFIG_X86_64
				666	if ((vcpu->arch.efer & EFER_LME)) {
				667	int cs_db, cs_l;
				668
				669	if (!is_pae(vcpu))
				670	return 1;
				671	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
				672	if (cs_l)
				673	return 1;
				674	} else
				675	#endif
				676	if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				677	kvm_read_cr3(vcpu)))
				678	return 1;
				679	}
				680
				681	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
				682	return 1;
				683
				684	kvm_x86_ops->set_cr0(vcpu, cr0);
				685
				686	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
				687	kvm_clear_async_pf_completion_queue(vcpu);
				688	kvm_async_pf_hash_reset(vcpu);
				689	}
				690
				691	if ((cr0 ^ old_cr0) & update_bits)
				692	kvm_mmu_reset_context(vcpu);
				693
				694	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
				695	kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
				696	!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
				697	kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
				698
				699	return 0;
				700	}
				701	EXPORT_SYMBOL_GPL(kvm_set_cr0);
				702
				703	void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
				704	{
				705	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) \| (msw & 0x0f));
				706	}
				707	EXPORT_SYMBOL_GPL(kvm_lmsw);
				708
				709	static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
				710	{
				711	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
				712	!vcpu->guest_xcr0_loaded) {
				713	/* kvm_set_xcr() also depends on this */
				714	xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
				715	vcpu->guest_xcr0_loaded = 1;
				716	}
				717	}
				718
				719	static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
				720	{
				721	if (vcpu->guest_xcr0_loaded) {
				722	if (vcpu->arch.xcr0 != host_xcr0)
				723	xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
				724	vcpu->guest_xcr0_loaded = 0;
				725	}
				726	}
				727
				728	static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
				729	{
				730	u64 xcr0 = xcr;
				731	u64 old_xcr0 = vcpu->arch.xcr0;
				732	u64 valid_bits;
				733
				734	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
				735	if (index != XCR_XFEATURE_ENABLED_MASK)
				736	return 1;
				737	if (!(xcr0 & XFEATURE_MASK_FP))
				738	return 1;
				739	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
				740	return 1;
				741
				742	/*
				743	* Do not allow the guest to set bits that we do not support
				744	* saving. However, xcr0 bit 0 is always set, even if the
				745	* emulated CPU does not support XSAVE (see fx_init).
				746	*/
				747	valid_bits = vcpu->arch.guest_supported_xcr0 \| XFEATURE_MASK_FP;
				748	if (xcr0 & ~valid_bits)
				749	return 1;
				750
				751	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
				752	(!(xcr0 & XFEATURE_MASK_BNDCSR)))
				753	return 1;
				754
				755	if (xcr0 & XFEATURE_MASK_AVX512) {
				756	if (!(xcr0 & XFEATURE_MASK_YMM))
				757	return 1;
				758	if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
				759	return 1;
				760	}
				761	vcpu->arch.xcr0 = xcr0;
				762
				763	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
				764	kvm_update_cpuid(vcpu);
				765	return 0;
				766	}
				767
				768	int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
				769	{
				770	if (kvm_x86_ops->get_cpl(vcpu) != 0 \|\|
				771	__kvm_set_xcr(vcpu, index, xcr)) {
				772	kvm_inject_gp(vcpu, 0);
				773	return 1;
				774	}
				775	return 0;
				776	}
				777	EXPORT_SYMBOL_GPL(kvm_set_xcr);
				778
				779	int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
				780	{
				781	unsigned long old_cr4 = kvm_read_cr4(vcpu);
				782	unsigned long pdptr_bits = X86_CR4_PGE \| X86_CR4_PSE \| X86_CR4_PAE \|
				783	X86_CR4_SMEP \| X86_CR4_SMAP \| X86_CR4_PKE;
				784
				785	if (cr4 & CR4_RESERVED_BITS)
				786	return 1;
				787
				788	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
				789	return 1;
				790
				791	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
				792	return 1;
				793
				794	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
				795	return 1;
				796
				797	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
				798	return 1;
				799
				800	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
				801	return 1;
				802
				803	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
				804	return 1;
				805
				806	if (is_long_mode(vcpu)) {
				807	if (!(cr4 & X86_CR4_PAE))
				808	return 1;
				809	if ((cr4 ^ old_cr4) & X86_CR4_LA57)
				810	return 1;
				811	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
				812	&& ((cr4 ^ old_cr4) & pdptr_bits)
				813	&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				814	kvm_read_cr3(vcpu)))
				815	return 1;
				816
				817	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
				818	if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
				819	return 1;
				820
				821	/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
				822	if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) \|\| !is_long_mode(vcpu))
				823	return 1;
				824	}
				825
				826	if (kvm_x86_ops->set_cr4(vcpu, cr4))
				827	return 1;
				828
				829	if (((cr4 ^ old_cr4) & pdptr_bits) \|\|
				830	(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
				831	kvm_mmu_reset_context(vcpu);
				832
				833	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE \| X86_CR4_PKE))
				834	kvm_update_cpuid(vcpu);
				835
				836	return 0;
				837	}
				838	EXPORT_SYMBOL_GPL(kvm_set_cr4);
				839
				840	int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
				841	{
				842	#ifdef CONFIG_X86_64
				843	cr3 &= ~CR3_PCID_INVD;
				844	#endif
				845
				846	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
				847	kvm_mmu_sync_roots(vcpu);
				848	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				849	return 0;
				850	}
				851
				852	if (is_long_mode(vcpu) &&
				853	(cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
				854	return 1;
				855	else if (is_pae_paging(vcpu) &&
				856	!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
				857	return 1;
				858
				859	vcpu->arch.cr3 = cr3;
				860	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
				861	kvm_mmu_new_cr3(vcpu);
				862	return 0;
				863	}
				864	EXPORT_SYMBOL_GPL(kvm_set_cr3);
				865
				866	int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
				867	{
				868	if (cr8 & CR8_RESERVED_BITS)
				869	return 1;
				870	if (lapic_in_kernel(vcpu))
				871	kvm_lapic_set_tpr(vcpu, cr8);
				872	else
				873	vcpu->arch.cr8 = cr8;
				874	return 0;
				875	}
				876	EXPORT_SYMBOL_GPL(kvm_set_cr8);
				877
				878	unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
				879	{
				880	if (lapic_in_kernel(vcpu))
				881	return kvm_lapic_get_cr8(vcpu);
				882	else
				883	return vcpu->arch.cr8;
				884	}
				885	EXPORT_SYMBOL_GPL(kvm_get_cr8);
				886
				887	static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
				888	{
				889	int i;
				890
				891	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
				892	for (i = 0; i < KVM_NR_DB_REGS; i++)
				893	vcpu->arch.eff_db[i] = vcpu->arch.db[i];
				894	vcpu->arch.switch_db_regs \|= KVM_DEBUGREG_RELOAD;
				895	}
				896	}
				897
				898	static void kvm_update_dr6(struct kvm_vcpu *vcpu)
				899	{
				900	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
				901	kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
				902	}
				903
				904	static void kvm_update_dr7(struct kvm_vcpu *vcpu)
				905	{
				906	unsigned long dr7;
				907
				908	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
				909	dr7 = vcpu->arch.guest_debug_dr7;
				910	else
				911	dr7 = vcpu->arch.dr7;
				912	kvm_x86_ops->set_dr7(vcpu, dr7);
				913	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
				914	if (dr7 & DR7_BP_EN_MASK)
				915	vcpu->arch.switch_db_regs \|= KVM_DEBUGREG_BP_ENABLED;
				916	}
				917
				918	static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
				919	{
				920	u64 fixed = DR6_FIXED_1;
				921
				922	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
				923	fixed \|= DR6_RTM;
				924	return fixed;
				925	}
				926
				927	static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
				928	{
				929	size_t size = ARRAY_SIZE(vcpu->arch.db);
				930
				931	switch (dr) {
				932	case 0 ... 3:
				933	vcpu->arch.db[array_index_nospec(dr, size)] = val;
				934	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
				935	vcpu->arch.eff_db[dr] = val;
				936	break;
				937	case 4:
				938	/* fall through */
				939	case 6:
				940	if (val & 0xffffffff00000000ULL)
				941	return -1; /* #GP */
				942	vcpu->arch.dr6 = (val & DR6_VOLATILE) \| kvm_dr6_fixed(vcpu);
				943	kvm_update_dr6(vcpu);
				944	break;
				945	case 5:
				946	/* fall through */
				947	default: /* 7 */
				948	if (val & 0xffffffff00000000ULL)
				949	return -1; /* #GP */
				950	vcpu->arch.dr7 = (val & DR7_VOLATILE) \| DR7_FIXED_1;
				951	kvm_update_dr7(vcpu);
				952	break;
				953	}
				954
				955	return 0;
				956	}
				957
				/*
				 * Set debug register @dr to @val.  If __kvm_set_dr() rejects the write,
				 * inject #GP(0) into the guest and return 1; otherwise return 0.
				 */
				int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
				{
					if (!__kvm_set_dr(vcpu, dr, val))
						return 0;

					kvm_inject_gp(vcpu, 0);
					return 1;
				}
				EXPORT_SYMBOL_GPL(kvm_set_dr);
				967
				968	int kvm_get_dr(struct kvm_vcpu vcpu, int dr, unsigned long val)
				969	{
				970	size_t size = ARRAY_SIZE(vcpu->arch.db);
				971
				972	switch (dr) {
				973	case 0 ... 3:
				974	*val = vcpu->arch.db[array_index_nospec(dr, size)];
				975	break;
				976	case 4:
				977	/* fall through */
				978	case 6:
				979	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
				980	*val = vcpu->arch.dr6;
				981	else
				982	*val = kvm_x86_ops->get_dr6(vcpu);
				983	break;
				984	case 5:
				985	/* fall through */
				986	default: /* 7 */
				987	*val = vcpu->arch.dr7;
				988	break;
				989	}
				990	return 0;
				991	}
				992	EXPORT_SYMBOL_GPL(kvm_get_dr);
				993
				994	bool kvm_rdpmc(struct kvm_vcpu *vcpu)
				995	{
				996	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
				997	u64 data;
				998	int err;
				999
				1000	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
				1001	if (err)
				1002	return err;
				1003	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
				1004	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
				1005	return err;
				1006	}
				1007	EXPORT_SYMBOL_GPL(kvm_rdpmc);
				1008
				1009	/*
				1010	* List of msr numbers which we expose to userspace through KVM_GET_MSRS
				1011	* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
				1012	*
				1013	* This list is modified at module load time to reflect the
				1014	* capabilities of the host cpu. This capabilities test skips MSRs that are
				1015	* kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
				1016	* may depend on host virtualization features rather than host cpu features.
				1017	*/
				1018
				1019	static u32 msrs_to_save[] = {
				1020	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
				1021	MSR_STAR,
				1022	#ifdef CONFIG_X86_64
				1023	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
				1024	#endif
				1025	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
				1026	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
				1027	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
				1028	};
				1029
				1030	static unsigned num_msrs_to_save;
				1031
				1032	static u32 emulated_msrs[] = {
				1033	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
				1034	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
				1035	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
				1036	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
				1037	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
				1038	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
				1039	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
				1040	HV_X64_MSR_RESET,
				1041	HV_X64_MSR_VP_INDEX,
				1042	HV_X64_MSR_VP_RUNTIME,
				1043	HV_X64_MSR_SCONTROL,
				1044	HV_X64_MSR_STIMER0_CONFIG,
				1045	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
				1046	MSR_KVM_PV_EOI_EN,
				1047
				1048	MSR_IA32_TSC_ADJUST,
				1049	MSR_IA32_TSCDEADLINE,
				1050	MSR_IA32_MISC_ENABLE,
				1051	MSR_IA32_MCG_STATUS,
				1052	MSR_IA32_MCG_CTL,
				1053	MSR_IA32_MCG_EXT_CTL,
				1054	MSR_IA32_SMBASE,
				1055	MSR_PLATFORM_INFO,
				1056	MSR_MISC_FEATURES_ENABLES,
				1057	MSR_AMD64_VIRT_SPEC_CTRL,
				1058	};
				1059
				1060	static unsigned num_emulated_msrs;
				1061
				1062	/*
				1063	* List of msr numbers which are used to expose MSR-based features that
				1064	* can be used by a hypervisor to validate requested CPU features.
				1065	*/
				1066	static u32 msr_based_features[] = {
				1067	MSR_F10H_DECFG,
				1068	MSR_IA32_UCODE_REV,
				1069	MSR_IA32_ARCH_CAPABILITIES,
				1070	};
				1071
				1072	static unsigned int num_msr_based_features;
				1073
				1074	u64 kvm_get_arch_capabilities(void)
				1075	{
				1076	u64 data;
				1077
				1078	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
				1079
				1080	/*
				1081	* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
				1082	* the nested hypervisor runs with NX huge pages. If it is not,
				1083	* L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
				1084	* L1 guests, so it need not worry about its own (L2) guests.
				1085	*/
				1086	data \|= ARCH_CAP_PSCHANGE_MC_NO;
				1087
				1088	/*
				1089	* If we're doing cache flushes (either "always" or "cond")
				1090	* we will do one whenever the guest does a vmlaunch/vmresume.
				1091	* If an outer hypervisor is doing the cache flush for us
				1092	* (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
				1093	* capability to the guest too, and if EPT is disabled we're not
				1094	* vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
				1095	* require a nested hypervisor to do a flush of its own.
				1096	*/
				1097	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
				1098	data \|= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
				1099
				1100	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
				1101	data \|= ARCH_CAP_RDCL_NO;
				1102	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
				1103	data \|= ARCH_CAP_SSB_NO;
				1104	if (!boot_cpu_has_bug(X86_BUG_MDS))
				1105	data \|= ARCH_CAP_MDS_NO;
				1106
				1107	/*
				1108	* On TAA affected systems, export MDS_NO=0 when:
				1109	* - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
				1110	* - Updated microcode is present. This is detected by
				1111	* the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
				1112	* that VERW clears CPU buffers.
				1113	*
				1114	* When MDS_NO=0 is exported, guests deploy clear CPU buffer
				1115	* mitigation and don't complain:
				1116	*
				1117	* "Vulnerable: Clear CPU buffers attempted, no microcode"
				1118	*
				1119	* If TSX is disabled on the system, guests are also mitigated against
				1120	* TAA and clear CPU buffer mitigation is not required for guests.
				1121	*/
				1122	if (!boot_cpu_has(X86_FEATURE_RTM))
				1123	data &= ~ARCH_CAP_TAA_NO;
				1124	else if (!boot_cpu_has_bug(X86_BUG_TAA))
				1125	data \|= ARCH_CAP_TAA_NO;
				1126	else if (data & ARCH_CAP_TSX_CTRL_MSR)
				1127	data &= ~ARCH_CAP_MDS_NO;
				1128
				1129	/* KVM does not emulate MSR_IA32_TSX_CTRL. */
				1130	data &= ~ARCH_CAP_TSX_CTRL_MSR;
				1131	return data;
				1132	}
				1133
				1134	EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
				1135
				1136	static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
				1137	{
				1138	switch (msr->index) {
				1139	case MSR_IA32_ARCH_CAPABILITIES:
				1140	msr->data = kvm_get_arch_capabilities();
				1141	break;
				1142	case MSR_IA32_UCODE_REV:
				1143	rdmsrl_safe(msr->index, &msr->data);
				1144	break;
				1145	default:
				1146	if (kvm_x86_ops->get_msr_feature(msr))
				1147	return 1;
				1148	}
				1149	return 0;
				1150	}
				1151
				1152	static int do_get_msr_feature(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1153	{
				1154	struct kvm_msr_entry msr;
				1155	int r;
				1156
				1157	msr.index = index;
				1158	r = kvm_get_msr_feature(&msr);
				1159	if (r)
				1160	return r;
				1161
				1162	*data = msr.data;
				1163
				1164	return 0;
				1165	}
				1166
				1167	static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
				1168	{
				1169	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
				1170	return false;
				1171
				1172	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
				1173	return false;
				1174
				1175	return true;
				1176
				1177	}
				1178	bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
				1179	{
				1180	if (efer & efer_reserved_bits)
				1181	return false;
				1182
				1183	return __kvm_valid_efer(vcpu, efer);
				1184	}
				1185	EXPORT_SYMBOL_GPL(kvm_valid_efer);
				1186
				1187	static int set_efer(struct kvm_vcpu vcpu, struct msr_data msr_info)
				1188	{
				1189	u64 old_efer = vcpu->arch.efer;
				1190	u64 efer = msr_info->data;
				1191
				1192	if (efer & efer_reserved_bits)
				1193	return 1;
				1194
				1195	if (!msr_info->host_initiated) {
				1196	if (!__kvm_valid_efer(vcpu, efer))
				1197	return 1;
				1198
				1199	if (is_paging(vcpu) &&
				1200	(vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
				1201	return 1;
				1202	}
				1203
				1204	efer &= ~EFER_LMA;
				1205	efer \|= vcpu->arch.efer & EFER_LMA;
				1206
				1207	kvm_x86_ops->set_efer(vcpu, efer);
				1208
				1209	/* Update reserved bits */
				1210	if ((efer ^ old_efer) & EFER_NX)
				1211	kvm_mmu_reset_context(vcpu);
				1212
				1213	return 0;
				1214	}
				1215
				1216	void kvm_enable_efer_bits(u64 mask)
				1217	{
				1218	efer_reserved_bits &= ~mask;
				1219	}
				1220	EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
				1221
				1222	/*
				1223	* Writes msr value into into the appropriate "register".
				1224	* Returns 0 on success, non-0 otherwise.
				1225	* Assumes vcpu_load() was already called.
				1226	*/
				1227	int kvm_set_msr(struct kvm_vcpu vcpu, struct msr_data msr)
				1228	{
				1229	switch (msr->index) {
				1230	case MSR_FS_BASE:
				1231	case MSR_GS_BASE:
				1232	case MSR_KERNEL_GS_BASE:
				1233	case MSR_CSTAR:
				1234	case MSR_LSTAR:
				1235	if (is_noncanonical_address(msr->data, vcpu))
				1236	return 1;
				1237	break;
				1238	case MSR_IA32_SYSENTER_EIP:
				1239	case MSR_IA32_SYSENTER_ESP:
				1240	/*
				1241	* IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
				1242	* non-canonical address is written on Intel but not on
				1243	* AMD (which ignores the top 32-bits, because it does
				1244	* not implement 64-bit SYSENTER).
				1245	*
				1246	* 64-bit code should hence be able to write a non-canonical
				1247	* value on AMD. Making the address canonical ensures that
				1248	* vmentry does not fail on Intel after writing a non-canonical
				1249	* value, and that something deterministic happens if the guest
				1250	* invokes 64-bit SYSENTER.
				1251	*/
				1252	msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
				1253	}
				1254	return kvm_x86_ops->set_msr(vcpu, msr);
				1255	}
				1256	EXPORT_SYMBOL_GPL(kvm_set_msr);
				1257
				1258	/*
				1259	* Adapt set_msr() to msr_io()'s calling convention
				1260	*/
				1261	static int do_get_msr(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1262	{
				1263	struct msr_data msr;
				1264	int r;
				1265
				1266	msr.index = index;
				1267	msr.host_initiated = true;
				1268	r = kvm_get_msr(vcpu, &msr);
				1269	if (r)
				1270	return r;
				1271
				1272	*data = msr.data;
				1273	return 0;
				1274	}
				1275
				1276	static int do_set_msr(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1277	{
				1278	struct msr_data msr;
				1279
				1280	msr.data = *data;
				1281	msr.index = index;
				1282	msr.host_initiated = true;
				1283	return kvm_set_msr(vcpu, &msr);
				1284	}
				1285
				1286	#ifdef CONFIG_X86_64
				1287	struct pvclock_gtod_data {
				1288	seqcount_t seq;
				1289
				1290	struct { /* extract of a clocksource struct */
				1291	int vclock_mode;
				1292	u64 cycle_last;
				1293	u64 mask;
				1294	u32 mult;
				1295	u32 shift;
				1296	} clock;
				1297
				1298	u64 boot_ns;
				1299	u64 nsec_base;
				1300	u64 wall_time_sec;
				1301	};
				1302
				1303	static struct pvclock_gtod_data pvclock_gtod_data;
				1304
				1305	static void update_pvclock_gtod(struct timekeeper *tk)
				1306	{
				1307	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
				1308	u64 boot_ns;
				1309
				1310	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
				1311
				1312	write_seqcount_begin(&vdata->seq);
				1313
				1314	/* copy pvclock gtod data */
				1315	vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
				1316	vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
				1317	vdata->clock.mask = tk->tkr_mono.mask;
				1318	vdata->clock.mult = tk->tkr_mono.mult;
				1319	vdata->clock.shift = tk->tkr_mono.shift;
				1320
				1321	vdata->boot_ns = boot_ns;
				1322	vdata->nsec_base = tk->tkr_mono.xtime_nsec;
				1323
				1324	vdata->wall_time_sec = tk->xtime_sec;
				1325
				1326	write_seqcount_end(&vdata->seq);
				1327	}
				1328	#endif
				1329
				1330	void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
				1331	{
				1332	/*
				1333	* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
				1334	* vcpu_enter_guest. This function is only called from
				1335	* the physical CPU that is running vcpu.
				1336	*/
				1337	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
				1338	}
				1339
				1340	static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
				1341	{
				1342	int version;
				1343	int r;
				1344	struct pvclock_wall_clock wc;
				1345	struct timespec64 boot;
				1346
				1347	if (!wall_clock)
				1348	return;
				1349
				1350	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
				1351	if (r)
				1352	return;
				1353
				1354	if (version & 1)
				1355	++version; /* first time write, random junk */
				1356
				1357	++version;
				1358
				1359	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
				1360	return;
				1361
				1362	/*
				1363	* The guest calculates current wall clock time by adding
				1364	* system time (updated by kvm_guest_time_update below) to the
				1365	* wall clock specified here. guest system time equals host
				1366	* system time for us, thus we must fill in host boot time here.
				1367	*/
				1368	getboottime64(&boot);
				1369
				1370	if (kvm->arch.kvmclock_offset) {
				1371	struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
				1372	boot = timespec64_sub(boot, ts);
				1373	}
				1374	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
				1375	wc.nsec = boot.tv_nsec;
				1376	wc.version = version;
				1377
				1378	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
				1379
				1380	version++;
				1381	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
				1382	}
				1383
				1384	static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
				1385	{
				1386	do_shl32_div32(dividend, divisor);
				1387	return dividend;
				1388	}
				1389
				1390	static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
				1391	s8 pshift, u32 pmultiplier)
				1392	{
				1393	uint64_t scaled64;
				1394	int32_t shift = 0;
				1395	uint64_t tps64;
				1396	uint32_t tps32;
				1397
				1398	tps64 = base_hz;
				1399	scaled64 = scaled_hz;
				1400	while (tps64 > scaled64*2 \|\| tps64 & 0xffffffff00000000ULL) {
				1401	tps64 >>= 1;
				1402	shift--;
				1403	}
				1404
				1405	tps32 = (uint32_t)tps64;
				1406	while (tps32 <= scaled64 \|\| scaled64 & 0xffffffff00000000ULL) {
				1407	if (scaled64 & 0xffffffff00000000ULL \|\| tps32 & 0x80000000)
				1408	scaled64 >>= 1;
				1409	else
				1410	tps32 <<= 1;
				1411	shift++;
				1412	}
				1413
				1414	*pshift = shift;
				1415	*pmultiplier = div_frac(scaled64, tps32);
				1416
				1417	pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
				1418	__func__, base_hz, scaled_hz, shift, *pmultiplier);
				1419	}
				1420
				1421	#ifdef CONFIG_X86_64
				1422	static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
				1423	#endif
				1424
				1425	static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
				1426	static unsigned long max_tsc_khz;
				1427
				1428	static u32 adjust_tsc_khz(u32 khz, s32 ppm)
				1429	{
				1430	u64 v = (u64)khz * (1000000 + ppm);
				1431	do_div(v, 1000000);
				1432	return v;
				1433	}
				1434
				1435	static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
				1436	{
				1437	u64 ratio;
				1438
				1439	/* Guest TSC same frequency as host TSC? */
				1440	if (!scale) {
				1441	vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
				1442	return 0;
				1443	}
				1444
				1445	/* TSC scaling supported? */
				1446	if (!kvm_has_tsc_control) {
				1447	if (user_tsc_khz > tsc_khz) {
				1448	vcpu->arch.tsc_catchup = 1;
				1449	vcpu->arch.tsc_always_catchup = 1;
				1450	return 0;
				1451	} else {
				1452	pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
				1453	return -1;
				1454	}
				1455	}
				1456
				1457	/* TSC scaling required - calculate ratio */
				1458	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
				1459	user_tsc_khz, tsc_khz);
				1460
				1461	if (ratio == 0 \|\| ratio >= kvm_max_tsc_scaling_ratio) {
				1462	pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
				1463	user_tsc_khz);
				1464	return -1;
				1465	}
				1466
				1467	vcpu->arch.tsc_scaling_ratio = ratio;
				1468	return 0;
				1469	}
				1470
				1471	static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
				1472	{
				1473	u32 thresh_lo, thresh_hi;
				1474	int use_scaling = 0;
				1475
				1476	/* tsc_khz can be zero if TSC calibration fails */
				1477	if (user_tsc_khz == 0) {
				1478	/* set tsc_scaling_ratio to a safe value */
				1479	vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
				1480	return -1;
				1481	}
				1482
				1483	/* Compute a scale to convert nanoseconds in TSC cycles */
				1484	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
				1485	&vcpu->arch.virtual_tsc_shift,
				1486	&vcpu->arch.virtual_tsc_mult);
				1487	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
				1488
				1489	/*
				1490	* Compute the variation in TSC rate which is acceptable
				1491	* within the range of tolerance and decide if the
				1492	* rate being applied is within that bounds of the hardware
				1493	* rate. If so, no scaling or compensation need be done.
				1494	*/
				1495	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
				1496	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
				1497	if (user_tsc_khz < thresh_lo \|\| user_tsc_khz > thresh_hi) {
				1498	pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
				1499	use_scaling = 1;
				1500	}
				1501	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
				1502	}
				1503
				1504	static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
				1505	{
				1506	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
				1507	vcpu->arch.virtual_tsc_mult,
				1508	vcpu->arch.virtual_tsc_shift);
				1509	tsc += vcpu->arch.this_tsc_write;
				1510	return tsc;
				1511	}
				1512
				/*
				 * Request a master clock update for @vcpu's VM when appropriate and emit
				 * the kvm_track_tsc trace point.  Compiles to an empty function without
				 * CONFIG_X86_64.
				 */
				static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
				{
				#ifdef CONFIG_X86_64
					bool vcpus_matched;
					struct kvm_arch *ka = &vcpu->kvm->arch;
					struct pvclock_gtod_data *gtod = &pvclock_gtod_data;

					vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
							 atomic_read(&vcpu->kvm->online_vcpus));

					/*
					 * Once the masterclock is enabled, always perform request in
					 * order to update it.
					 *
					 * In order to enable masterclock, the host clocksource must be TSC
					 * and the vcpus need to have matched TSCs.  When that happens,
					 * perform request to enable masterclock.
					 */
					if (ka->use_master_clock ||
					    (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
						kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

					trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
							    atomic_read(&vcpu->kvm->online_vcpus),
							    ka->use_master_clock, gtod->clock.vclock_mode);
				#endif
				}
				1540
				1541	static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
				1542	{
				1543	u64 curr_offset = vcpu->arch.tsc_offset;
				1544	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
				1545	}
				1546
				1547	/*
				1548	* Multiply tsc by a fixed point number represented by ratio.
				1549	*
				1550	* The most significant 64-N bits (mult) of ratio represent the
				1551	* integral part of the fixed point number; the remaining N bits
				1552	* (frac) represent the fractional part, ie. ratio represents a fixed
				1553	* point number (mult + frac * 2^(-N)).
				1554	*
				1555	* N equals to kvm_tsc_scaling_ratio_frac_bits.
				1556	*/
				1557	static inline u64 __scale_tsc(u64 ratio, u64 tsc)
				1558	{
				1559	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
				1560	}
				1561
				1562	u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
				1563	{
				1564	u64 _tsc = tsc;
				1565	u64 ratio = vcpu->arch.tsc_scaling_ratio;
				1566
				1567	if (ratio != kvm_default_tsc_scaling_ratio)
				1568	_tsc = __scale_tsc(ratio, tsc);
				1569
				1570	return _tsc;
				1571	}
				1572	EXPORT_SYMBOL_GPL(kvm_scale_tsc);
				1573
				1574	static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
				1575	{
				1576	u64 tsc;
				1577
				1578	tsc = kvm_scale_tsc(vcpu, rdtsc());
				1579
				1580	return target_tsc - tsc;
				1581	}
				1582
				1583	u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
				1584	{
				1585	return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
				1586	}
				1587	EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
				1588
				1589	static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
				1590	{
				1591	kvm_x86_ops->write_tsc_offset(vcpu, offset);
				1592	vcpu->arch.tsc_offset = offset;
				1593	}
				1594
				1595	void kvm_write_tsc(struct kvm_vcpu vcpu, struct msr_data msr)
				1596	{
				1597	struct kvm *kvm = vcpu->kvm;
				1598	u64 offset, ns, elapsed;
				1599	unsigned long flags;
				1600	bool matched;
				1601	bool already_matched;
				1602	u64 data = msr->data;
				1603	bool synchronizing = false;
				1604
				1605	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
				1606	offset = kvm_compute_tsc_offset(vcpu, data);
				1607	ns = ktime_get_boot_ns();
				1608	elapsed = ns - kvm->arch.last_tsc_nsec;
				1609
				1610	if (vcpu->arch.virtual_tsc_khz) {
				1611	if (data == 0 && msr->host_initiated) {
				1612	/*
				1613	* detection of vcpu initialization -- need to sync
				1614	* with other vCPUs. This particularly helps to keep
				1615	* kvm_clock stable after CPU hotplug
				1616	*/
				1617	synchronizing = true;
				1618	} else {
				1619	u64 tsc_exp = kvm->arch.last_tsc_write +
				1620	nsec_to_cycles(vcpu, elapsed);
				1621	u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
				1622	/*
				1623	* Special case: TSC write with a small delta (1 second)
				1624	* of virtual cycle time against real time is
				1625	* interpreted as an attempt to synchronize the CPU.
				1626	*/
				1627	synchronizing = data < tsc_exp + tsc_hz &&
				1628	data + tsc_hz > tsc_exp;
				1629	}
				1630	}
				1631
				1632	/*
				1633	* For a reliable TSC, we can match TSC offsets, and for an unstable
				1634	* TSC, we add elapsed time in this computation. We could let the
				1635	* compensation code attempt to catch up if we fall behind, but
				1636	* it's better to try to match offsets from the beginning.
				1637	*/
				1638	if (synchronizing &&
				1639	vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
				1640	if (!check_tsc_unstable()) {
				1641	offset = kvm->arch.cur_tsc_offset;
				1642	pr_debug("kvm: matched tsc offset for %llu\n", data);
				1643	} else {
				1644	u64 delta = nsec_to_cycles(vcpu, elapsed);
				1645	data += delta;
				1646	offset = kvm_compute_tsc_offset(vcpu, data);
				1647	pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
				1648	}
				1649	matched = true;
				1650	already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
				1651	} else {
				1652	/*
				1653	* We split periods of matched TSC writes into generations.
				1654	* For each generation, we track the original measured
				1655	* nanosecond time, offset, and write, so if TSCs are in
				1656	* sync, we can match exact offset, and if not, we can match
				1657	* exact software computation in compute_guest_tsc()
				1658	*
				1659	* These values are tracked in kvm->arch.cur_xxx variables.
				1660	*/
				1661	kvm->arch.cur_tsc_generation++;
				1662	kvm->arch.cur_tsc_nsec = ns;
				1663	kvm->arch.cur_tsc_write = data;
				1664	kvm->arch.cur_tsc_offset = offset;
				1665	matched = false;
				1666	pr_debug("kvm: new tsc generation %llu, clock %llu\n",
				1667	kvm->arch.cur_tsc_generation, data);
				1668	}
				1669
				1670	/*
				1671	* We also track th most recent recorded KHZ, write and time to
				1672	* allow the matching interval to be extended at each write.
				1673	*/
				1674	kvm->arch.last_tsc_nsec = ns;
				1675	kvm->arch.last_tsc_write = data;
				1676	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
				1677
				1678	vcpu->arch.last_guest_tsc = data;
				1679
				1680	/* Keep track of which generation this VCPU has synchronized to */
				1681	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
				1682	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
				1683	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
				1684
				1685	if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
				1686	update_ia32_tsc_adjust_msr(vcpu, offset);
				1687
				1688	kvm_vcpu_write_tsc_offset(vcpu, offset);
				1689	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
				1690
				1691	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
				1692	if (!matched) {
				1693	kvm->arch.nr_vcpus_matched_tsc = 0;
				1694	} else if (!already_matched) {
				1695	kvm->arch.nr_vcpus_matched_tsc++;
				1696	}
				1697
				1698	kvm_track_tsc_matching(vcpu);
				1699	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
				1700	}
				1701
				1702	EXPORT_SYMBOL_GPL(kvm_write_tsc);
				1703
				1704	static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
				1705	s64 adjustment)
				1706	{
				1707	kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
				1708	}
				1709
				1710	static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
				1711	{
				1712	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
				1713	WARN_ON(adjustment < 0);
				1714	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
				1715	adjust_tsc_offset_guest(vcpu, adjustment);
				1716	}
				1717
				1718	#ifdef CONFIG_X86_64
				1719
				1720	static u64 read_tsc(void)
				1721	{
				1722	u64 ret = (u64)rdtsc_ordered();
				1723	u64 last = pvclock_gtod_data.clock.cycle_last;
				1724
				1725	if (likely(ret >= last))
				1726	return ret;
				1727
				1728	/*
				1729	* GCC likes to generate cmov here, but this branch is extremely
				1730	* predictable (it's just a function of time and the likely is
				1731	* very likely) and there's a data dependence, so force GCC
				1732	* to generate a branch instead. I don't barrier() because
				1733	* we don't actually need a barrier, and if this function
				1734	* ever gets inlined it will generate worse code.
				1735	*/
				1736	asm volatile ("");
				1737	return last;
				1738	}
				1739
				1740	static inline u64 vgettsc(u64 *cycle_now)
				1741	{
				1742	long v;
				1743	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				1744
				1745	*cycle_now = read_tsc();
				1746
				1747	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
				1748	return v * gtod->clock.mult;
				1749	}
				1750
				1751	static int do_monotonic_boot(s64 t, u64 cycle_now)
				1752	{
				1753	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				1754	unsigned long seq;
				1755	int mode;
				1756	u64 ns;
				1757
				1758	do {
				1759	seq = read_seqcount_begin(&gtod->seq);
				1760	mode = gtod->clock.vclock_mode;
				1761	ns = gtod->nsec_base;
				1762	ns += vgettsc(cycle_now);
				1763	ns >>= gtod->clock.shift;
				1764	ns += gtod->boot_ns;
				1765	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
				1766	*t = ns;
				1767
				1768	return mode;
				1769	}
				1770
				1771	static int do_realtime(struct timespec ts, u64 cycle_now)
				1772	{
				1773	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				1774	unsigned long seq;
				1775	int mode;
				1776	u64 ns;
				1777
				1778	do {
				1779	seq = read_seqcount_begin(&gtod->seq);
				1780	mode = gtod->clock.vclock_mode;
				1781	ts->tv_sec = gtod->wall_time_sec;
				1782	ns = gtod->nsec_base;
				1783	ns += vgettsc(cycle_now);
				1784	ns >>= gtod->clock.shift;
				1785	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
				1786
				1787	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
				1788	ts->tv_nsec = ns;
				1789
				1790	return mode;
				1791	}
				1792
				1793	/* returns true if host is using tsc clocksource */
				1794	static bool kvm_get_time_and_clockread(s64 kernel_ns, u64 cycle_now)
				1795	{
				1796	/* checked again under seqlock below */
				1797	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
				1798	return false;
				1799
				1800	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
				1801	}
				1802
				1803	/* returns true if host is using tsc clocksource */
				1804	static bool kvm_get_walltime_and_clockread(struct timespec *ts,
				1805	u64 *cycle_now)
				1806	{
				1807	/* checked again under seqlock below */
				1808	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
				1809	return false;
				1810
				1811	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
				1812	}
				1813	#endif
				1814
				1815	/*
				1816	*
				1817	* Assuming a stable TSC across physical CPUS, and a stable TSC
				1818	* across virtual CPUs, the following condition is possible.
				1819	* Each numbered line represents an event visible to both
				1820	* CPUs at the next numbered event.
				1821	*
				1822	* "timespecX" represents host monotonic time. "tscX" represents
				1823	* RDTSC value.
				1824	*
				1825	* VCPU0 on CPU0                        | VCPU1 on CPU1
				1826	*
				1827	* 1. read timespec0,tsc0
				1828	* 2.                                   | timespec1 = timespec0 + N
				1829	*                                      | tsc1 = tsc0 + M
				1830	* 3. transition to guest               | transition to guest
				1831	* 4. ret0 = timespec0 + (rdtsc - tsc0) |
				1832	* 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
				1833	*                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
				1834	*
				1835	* Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
				1836	*
				1837	* - ret0 < ret1
				1838	* - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
				1839	* ...
				1840	* - 0 < N - M => M < N
				1841	*
				1842	* That is, when timespec0 != timespec1, M < N. Unfortunately that is not
				1843	* always the case (the difference between two distinct xtime instances
				1844	* might be smaller than the difference between corresponding TSC reads,
				1845	* when updating guest vcpus pvclock areas).
				1846	*
				1847	* To avoid that problem, do not allow visibility of distinct
				1848	* system_timestamp/tsc_timestamp values simultaneously: use a master
				1849	* copy of host monotonic time values. Update that master copy
				1850	* in lockstep.
				1851	*
				1852	* Rely on synchronization of host TSCs and guest TSCs for monotonicity.
				1853	*
				1854	*/
				1855
				/*
				 * Refresh the VM's master copy of host time (master_kernel_ns and
				 * master_cycle_now) and recompute use_master_clock.  Compiles to an empty
				 * function without CONFIG_X86_64.
				 */
				static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
				{
				#ifdef CONFIG_X86_64
					struct kvm_arch *ka = &kvm->arch;
					bool tsc_clocksource;
					bool all_matched;
					int mode;

					all_matched = (ka->nr_vcpus_matched_tsc + 1 ==
						       atomic_read(&kvm->online_vcpus));

					/*
					 * If the host uses TSC clock, then passthrough TSC as stable
					 * to the guest.
					 */
					tsc_clocksource = kvm_get_time_and_clockread(&ka->master_kernel_ns,
										     &ka->master_cycle_now);

					ka->use_master_clock = tsc_clocksource &&
							       all_matched &&
							       !ka->backwards_tsc_observed &&
							       !ka->boot_vcpu_runs_old_kvmclock;

					if (ka->use_master_clock)
						atomic_set(&kvm_guest_has_master_clock, 1);

					mode = pvclock_gtod_data.clock.vclock_mode;
					trace_kvm_update_master_clock(ka->use_master_clock, mode,
								      all_matched);
				#endif
				}
				1886
				1887	void kvm_make_mclock_inprogress_request(struct kvm *kvm)
				1888	{
				1889	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
				1890	}
				1891
				/*
				 * Recompute the VM-wide master clock copy under pvclock_gtod_sync_lock
				 * and ask every vcpu to refresh its clock.
				 *
				 * KVM_REQ_MCLOCK_INPROGRESS is raised on all vcpus before the update
				 * and cleared again once every vcpu has a pending KVM_REQ_CLOCK_UPDATE.
				 * Compiled to an empty function when CONFIG_X86_64 is not set.
				 */
				static void kvm_gen_update_masterclock(struct kvm *kvm)
				{
				#ifdef CONFIG_X86_64
					struct kvm_arch *karch = &kvm->arch;
					struct kvm_vcpu *cur;
					int idx;

					spin_lock(&karch->pvclock_gtod_sync_lock);

					kvm_make_mclock_inprogress_request(kvm);
					/* no guest entries from this point */
					pvclock_update_vm_gtod_copy(kvm);

					kvm_for_each_vcpu(idx, cur, kvm) {
						kvm_make_request(KVM_REQ_CLOCK_UPDATE, cur);
					}

					/* guest entries allowed */
					kvm_for_each_vcpu(idx, cur, kvm) {
						kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, cur);
					}

					spin_unlock(&karch->pvclock_gtod_sync_lock);
				#endif
				}
				1914
				1915	u64 get_kvmclock_ns(struct kvm *kvm)
				1916	{
				1917	struct kvm_arch *ka = &kvm->arch;
				1918	struct pvclock_vcpu_time_info hv_clock;
				1919	u64 ret;
				1920
				1921	spin_lock(&ka->pvclock_gtod_sync_lock);
				1922	if (!ka->use_master_clock) {
				1923	spin_unlock(&ka->pvclock_gtod_sync_lock);
				1924	return ktime_get_boot_ns() + ka->kvmclock_offset;
				1925	}
				1926
				1927	hv_clock.tsc_timestamp = ka->master_cycle_now;
				1928	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
				1929	spin_unlock(&ka->pvclock_gtod_sync_lock);
				1930
				1931	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
				1932	get_cpu();
				1933
				1934	if (__this_cpu_read(cpu_tsc_khz)) {
				1935	kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				1936	&hv_clock.tsc_shift,
				1937	&hv_clock.tsc_to_system_mul);
				1938	ret = __pvclock_read_cycles(&hv_clock, rdtsc());
				1939	} else
				1940	ret = ktime_get_boot_ns() + ka->kvmclock_offset;
				1941
				1942	put_cpu();
				1943
				1944	return ret;
				1945	}
				1946
				1947	static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
				1948	{
				1949	struct kvm_vcpu_arch *vcpu = &v->arch;
				1950	struct pvclock_vcpu_time_info guest_hv_clock;
				1951
				1952	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
				1953	&guest_hv_clock, sizeof(guest_hv_clock))))
				1954	return;
				1955
				1956	/* This VCPU is paused, but it's legal for a guest to read another
				1957	* VCPU's kvmclock, so we really have to follow the specification where
				1958	* it says that version is odd if data is being modified, and even after
				1959	* it is consistent.
				1960	*
				1961	* Version field updates must be kept separate. This is because
				1962	* kvm_write_guest_cached might use a "rep movs" instruction, and
				1963	* writes within a string instruction are weakly ordered. So there
				1964	* are three writes overall.
				1965	*
				1966	* As a small optimization, only write the version field in the first
				1967	* and third write. The vcpu->pv_time cache is still valid, because the
				1968	* version field is the first in the struct.
				1969	*/
				1970	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
				1971
				1972	if (guest_hv_clock.version & 1)
				1973	++guest_hv_clock.version; /* first time write, random junk */
				1974
				1975	vcpu->hv_clock.version = guest_hv_clock.version + 1;
				1976	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				1977	&vcpu->hv_clock,
				1978	sizeof(vcpu->hv_clock.version));
				1979
				1980	smp_wmb();
				1981
				1982	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
				1983	vcpu->hv_clock.flags \|= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
				1984
				1985	if (vcpu->pvclock_set_guest_stopped_request) {
				1986	vcpu->hv_clock.flags \|= PVCLOCK_GUEST_STOPPED;
				1987	vcpu->pvclock_set_guest_stopped_request = false;
				1988	}
				1989
				1990	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
				1991
				1992	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				1993	&vcpu->hv_clock,
				1994	sizeof(vcpu->hv_clock));
				1995
				1996	smp_wmb();
				1997
				1998	vcpu->hv_clock.version++;
				1999	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				2000	&vcpu->hv_clock,
				2001	sizeof(vcpu->hv_clock.version));
				2002	}
				2003
				2004	static int kvm_guest_time_update(struct kvm_vcpu *v)
				2005	{
				2006	unsigned long flags, tgt_tsc_khz;
				2007	struct kvm_vcpu_arch *vcpu = &v->arch;
				2008	struct kvm_arch *ka = &v->kvm->arch;
				2009	s64 kernel_ns;
				2010	u64 tsc_timestamp, host_tsc;
				2011	u8 pvclock_flags;
				2012	bool use_master_clock;
				2013
				2014	kernel_ns = 0;
				2015	host_tsc = 0;
				2016
				2017	/*
				2018	* If the host uses TSC clock, then passthrough TSC as stable
				2019	* to the guest.
				2020	*/
				2021	spin_lock(&ka->pvclock_gtod_sync_lock);
				2022	use_master_clock = ka->use_master_clock;
				2023	if (use_master_clock) {
				2024	host_tsc = ka->master_cycle_now;
				2025	kernel_ns = ka->master_kernel_ns;
				2026	}
				2027	spin_unlock(&ka->pvclock_gtod_sync_lock);
				2028
				2029	/* Keep irq disabled to prevent changes to the clock */
				2030	local_irq_save(flags);
				2031	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
				2032	if (unlikely(tgt_tsc_khz == 0)) {
				2033	local_irq_restore(flags);
				2034	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
				2035	return 1;
				2036	}
				2037	if (!use_master_clock) {
				2038	host_tsc = rdtsc();
				2039	kernel_ns = ktime_get_boot_ns();
				2040	}
				2041
				2042	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
				2043
				2044	/*
				2045	* We may have to catch up the TSC to match elapsed wall clock
				2046	* time for two reasons, even if kvmclock is used.
				2047	* 1) CPU could have been running below the maximum TSC rate
				2048	* 2) Broken TSC compensation resets the base at each VCPU
				2049	* entry to avoid unknown leaps of TSC even when running
				2050	* again on the same CPU. This may cause apparent elapsed
				2051	* time to disappear, and the guest to stand still or run
				2052	* very slowly.
				2053	*/
				2054	if (vcpu->tsc_catchup) {
				2055	u64 tsc = compute_guest_tsc(v, kernel_ns);
				2056	if (tsc > tsc_timestamp) {
				2057	adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
				2058	tsc_timestamp = tsc;
				2059	}
				2060	}
				2061
				2062	local_irq_restore(flags);
				2063
				2064	/* With all the info we got, fill in the values */
				2065
				2066	if (kvm_has_tsc_control)
				2067	tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
				2068
				2069	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
				2070	kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				2071	&vcpu->hv_clock.tsc_shift,
				2072	&vcpu->hv_clock.tsc_to_system_mul);
				2073	vcpu->hw_tsc_khz = tgt_tsc_khz;
				2074	}
				2075
				2076	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
				2077	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
				2078	vcpu->last_guest_tsc = tsc_timestamp;
				2079
				2080	/* If the host uses TSC clocksource, then it is stable */
				2081	pvclock_flags = 0;
				2082	if (use_master_clock)
				2083	pvclock_flags \|= PVCLOCK_TSC_STABLE_BIT;
				2084
				2085	vcpu->hv_clock.flags = pvclock_flags;
				2086
				2087	if (vcpu->pv_time_enabled)
				2088	kvm_setup_pvclock_page(v);
				2089	if (v == kvm_get_vcpu(v->kvm, 0))
				2090	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
				2091	return 0;
				2092	}
				2093
				2094	/*
				2095	* kvmclock updates which are isolated to a given vcpu, such as
				2096	* vcpu->cpu migration, should not allow system_timestamp from
				2097	* the rest of the vcpus to remain static. Otherwise ntp frequency
				2098	* correction applies to one vcpu's system_timestamp but not
				2099	* the others.
				2100	*
				2101	* So in those cases, request a kvmclock update for all vcpus.
				2102	* We need to rate-limit these requests though, as they can
				2103	* considerably slow guests that have a large number of vcpus.
				2104	* The time for a remote vcpu to update its kvmclock is bound
				2105	* by the delay we use to rate-limit the updates.
				2106	*/
				2107
				2108	#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
				2109
				2110	static void kvmclock_update_fn(struct work_struct *work)
				2111	{
				2112	int i;
				2113	struct delayed_work *dwork = to_delayed_work(work);
				2114	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
				2115	kvmclock_update_work);
				2116	struct kvm *kvm = container_of(ka, struct kvm, arch);
				2117	struct kvm_vcpu *vcpu;
				2118
				2119	kvm_for_each_vcpu(i, vcpu, kvm) {
				2120	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				2121	kvm_vcpu_kick(vcpu);
				2122	}
				2123	}
				2124
				2125	static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
				2126	{
				2127	struct kvm *kvm = v->kvm;
				2128
				2129	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
				2130	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
				2131	KVMCLOCK_UPDATE_DELAY);
				2132	}
				2133
				2134	#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
				2135
				2136	static void kvmclock_sync_fn(struct work_struct *work)
				2137	{
				2138	struct delayed_work *dwork = to_delayed_work(work);
				2139	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
				2140	kvmclock_sync_work);
				2141	struct kvm *kvm = container_of(ka, struct kvm, arch);
				2142
				2143	if (!kvmclock_periodic_sync)
				2144	return;
				2145
				2146	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
				2147	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
				2148	KVMCLOCK_SYNC_PERIOD);
				2149	}
				2150
				2151	static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
				2152	{
				2153	u64 mcg_cap = vcpu->arch.mcg_cap;
				2154	unsigned bank_num = mcg_cap & 0xff;
				2155
				2156	switch (msr) {
				2157	case MSR_IA32_MCG_STATUS:
				2158	vcpu->arch.mcg_status = data;
				2159	break;
				2160	case MSR_IA32_MCG_CTL:
				2161	if (!(mcg_cap & MCG_CTL_P))
				2162	return 1;
				2163	if (data != 0 && data != ~(u64)0)
				2164	return -1;
				2165	vcpu->arch.mcg_ctl = data;
				2166	break;
				2167	default:
				2168	if (msr >= MSR_IA32_MC0_CTL &&
				2169	msr < MSR_IA32_MCx_CTL(bank_num)) {
				2170	u32 offset = array_index_nospec(
				2171	msr - MSR_IA32_MC0_CTL,
				2172	MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
				2173
				2174	/* only 0 or all 1s can be written to IA32_MCi_CTL
				2175	* some Linux kernels though clear bit 10 in bank 4 to
				2176	* workaround a BIOS/GART TBL issue on AMD K8s, ignore
				2177	* this to avoid an uncatched #GP in the guest
				2178	*/
				2179	if ((offset & 0x3) == 0 &&
				2180	data != 0 && (data \| (1 << 10)) != ~(u64)0)
				2181	return -1;
				2182	vcpu->arch.mce_banks[offset] = data;
				2183	break;
				2184	}
				2185	return 1;
				2186	}
				2187	return 0;
				2188	}
				2189
				2190	static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
				2191	{
				2192	struct kvm *kvm = vcpu->kvm;
				2193	int lm = is_long_mode(vcpu);
				2194	u8 blob_addr = lm ? (u8 )(long)kvm->arch.xen_hvm_config.blob_addr_64
				2195	: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
				2196	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				2197	: kvm->arch.xen_hvm_config.blob_size_32;
				2198	u32 page_num = data & ~PAGE_MASK;
				2199	u64 page_addr = data & PAGE_MASK;
				2200	u8 *page;
				2201	int r;
				2202
				2203	r = -E2BIG;
				2204	if (page_num >= blob_size)
				2205	goto out;
				2206	r = -ENOMEM;
				2207	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
				2208	if (IS_ERR(page)) {
				2209	r = PTR_ERR(page);
				2210	goto out;
				2211	}
				2212	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
				2213	goto out_free;
				2214	r = 0;
				2215	out_free:
				2216	kfree(page);
				2217	out:
				2218	return r;
				2219	}
				2220
				2221	static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
				2222	{
				2223	gpa_t gpa = data & ~0x3f;
				2224
				2225	/* Bits 3:5 are reserved, Should be zero */
				2226	if (data & 0x38)
				2227	return 1;
				2228
				2229	vcpu->arch.apf.msr_val = data;
				2230
				2231	if (!(data & KVM_ASYNC_PF_ENABLED)) {
				2232	kvm_clear_async_pf_completion_queue(vcpu);
				2233	kvm_async_pf_hash_reset(vcpu);
				2234	return 0;
				2235	}
				2236
				2237	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
				2238	sizeof(u32)))
				2239	return 1;
				2240
				2241	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
				2242	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
				2243	kvm_async_pf_wakeup_all(vcpu);
				2244	return 0;
				2245	}
				2246
				2247	static void kvmclock_reset(struct kvm_vcpu *vcpu)
				2248	{
				2249	vcpu->arch.pv_time_enabled = false;
				2250	}
				2251
				2252	static void record_steal_time(struct kvm_vcpu *vcpu)
				2253	{
				2254	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
				2255	return;
				2256
				2257	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
				2258	&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
				2259	return;
				2260
				2261	vcpu->arch.st.steal.preempted = 0;
				2262
				2263	if (vcpu->arch.st.steal.version & 1)
				2264	vcpu->arch.st.steal.version += 1; /* first time write, random junk */
				2265
				2266	vcpu->arch.st.steal.version += 1;
				2267
				2268	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
				2269	&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
				2270
				2271	smp_wmb();
				2272
				2273	vcpu->arch.st.steal.steal += current->sched_info.run_delay -
				2274	vcpu->arch.st.last_steal;
				2275	vcpu->arch.st.last_steal = current->sched_info.run_delay;
				2276
				2277	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
				2278	&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
				2279
				2280	smp_wmb();
				2281
				2282	vcpu->arch.st.steal.version += 1;
				2283
				2284	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
				2285	&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
				2286	}
				2287
				2288	int kvm_set_msr_common(struct kvm_vcpu vcpu, struct msr_data msr_info)
				2289	{
				2290	bool pr = false;
				2291	u32 msr = msr_info->index;
				2292	u64 data = msr_info->data;
				2293
				2294	switch (msr) {
				2295	case MSR_AMD64_NB_CFG:
				2296	case MSR_IA32_UCODE_WRITE:
				2297	case MSR_VM_HSAVE_PA:
				2298	case MSR_AMD64_PATCH_LOADER:
				2299	case MSR_AMD64_BU_CFG2:
				2300	case MSR_AMD64_DC_CFG:
				2301	case MSR_F15H_EX_CFG:
				2302	break;
				2303
				2304	case MSR_IA32_UCODE_REV:
				2305	if (msr_info->host_initiated)
				2306	vcpu->arch.microcode_version = data;
				2307	break;
				2308	case MSR_IA32_ARCH_CAPABILITIES:
				2309	if (!msr_info->host_initiated)
				2310	return 1;
				2311	vcpu->arch.arch_capabilities = data;
				2312	break;
				2313	case MSR_EFER:
				2314	return set_efer(vcpu, msr_info);
				2315	case MSR_K7_HWCR:
				2316	data &= ~(u64)0x40; /* ignore flush filter disable */
				2317	data &= ~(u64)0x100; /* ignore ignne emulation enable */
				2318	data &= ~(u64)0x8; /* ignore TLB cache disable */
				2319	data &= ~(u64)0x40000; /* ignore Mc status write enable */
				2320	if (data != 0) {
				2321	vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				2322	data);
				2323	return 1;
				2324	}
				2325	break;
				2326	case MSR_FAM10H_MMIO_CONF_BASE:
				2327	if (data != 0) {
				2328	vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				2329	"0x%llx\n", data);
				2330	return 1;
				2331	}
				2332	break;
				2333	case MSR_IA32_DEBUGCTLMSR:
				2334	if (!data) {
				2335	/* We support the non-activated case already */
				2336	break;
				2337	} else if (data & ~(DEBUGCTLMSR_LBR \| DEBUGCTLMSR_BTF)) {
				2338	/* Values other than LBR and BTF are vendor-specific,
				2339	thus reserved and should throw a #GP */
				2340	return 1;
				2341	}
				2342	vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
				2343	__func__, data);
				2344	break;
				2345	case 0x200 ... 0x2ff:
				2346	return kvm_mtrr_set_msr(vcpu, msr, data);
				2347	case MSR_IA32_APICBASE:
				2348	return kvm_set_apic_base(vcpu, msr_info);
				2349	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
				2350	return kvm_x2apic_msr_write(vcpu, msr, data);
				2351	case MSR_IA32_TSCDEADLINE:
				2352	kvm_set_lapic_tscdeadline_msr(vcpu, data);
				2353	break;
				2354	case MSR_IA32_TSC_ADJUST:
				2355	if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
				2356	if (!msr_info->host_initiated) {
				2357	s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
				2358	adjust_tsc_offset_guest(vcpu, adj);
				2359	}
				2360	vcpu->arch.ia32_tsc_adjust_msr = data;
				2361	}
				2362	break;
				2363	case MSR_IA32_MISC_ENABLE:
				2364	vcpu->arch.ia32_misc_enable_msr = data;
				2365	break;
				2366	case MSR_IA32_SMBASE:
				2367	if (!msr_info->host_initiated)
				2368	return 1;
				2369	vcpu->arch.smbase = data;
				2370	break;
				2371	case MSR_KVM_WALL_CLOCK_NEW:
				2372	case MSR_KVM_WALL_CLOCK:
				2373	vcpu->kvm->arch.wall_clock = data;
				2374	kvm_write_wall_clock(vcpu->kvm, data);
				2375	break;
				2376	case MSR_KVM_SYSTEM_TIME_NEW:
				2377	case MSR_KVM_SYSTEM_TIME: {
				2378	struct kvm_arch *ka = &vcpu->kvm->arch;
				2379
				2380	kvmclock_reset(vcpu);
				2381
				2382	if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
				2383	bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
				2384
				2385	if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				2386	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				2387
				2388	ka->boot_vcpu_runs_old_kvmclock = tmp;
				2389	}
				2390
				2391	vcpu->arch.time = data;
				2392	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
				2393
				2394	/* we verify if the enable bit is set... */
				2395	if (!(data & 1))
				2396	break;
				2397
				2398	if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
				2399	&vcpu->arch.pv_time, data & ~1ULL,
				2400	sizeof(struct pvclock_vcpu_time_info)))
				2401	vcpu->arch.pv_time_enabled = false;
				2402	else
				2403	vcpu->arch.pv_time_enabled = true;
				2404
				2405	break;
				2406	}
				2407	case MSR_KVM_ASYNC_PF_EN:
				2408	if (kvm_pv_enable_async_pf(vcpu, data))
				2409	return 1;
				2410	break;
				2411	case MSR_KVM_STEAL_TIME:
				2412
				2413	if (unlikely(!sched_info_on()))
				2414	return 1;
				2415
				2416	if (data & KVM_STEAL_RESERVED_MASK)
				2417	return 1;
				2418
				2419	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
				2420	data & KVM_STEAL_VALID_BITS,
				2421	sizeof(struct kvm_steal_time)))
				2422	return 1;
				2423
				2424	vcpu->arch.st.msr_val = data;
				2425
				2426	if (!(data & KVM_MSR_ENABLED))
				2427	break;
				2428
				2429	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
				2430
				2431	break;
				2432	case MSR_KVM_PV_EOI_EN:
				2433	if (kvm_lapic_enable_pv_eoi(vcpu, data))
				2434	return 1;
				2435	break;
				2436
				2437	case MSR_IA32_MCG_CTL:
				2438	case MSR_IA32_MCG_STATUS:
				2439	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
				2440	return set_msr_mce(vcpu, msr, data);
				2441
				2442	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
				2443	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
				2444	pr = true; /* fall through */
				2445	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
				2446	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
				2447	if (kvm_pmu_is_valid_msr(vcpu, msr))
				2448	return kvm_pmu_set_msr(vcpu, msr_info);
				2449
				2450	if (pr \|\| data != 0)
				2451	vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
				2452	"0x%x data 0x%llx\n", msr, data);
				2453	break;
				2454	case MSR_K7_CLK_CTL:
				2455	/*
				2456	* Ignore all writes to this no longer documented MSR.
				2457	* Writes are only relevant for old K7 processors,
				2458	* all pre-dating SVM, but a recommended workaround from
				2459	* AMD for these chips. It is possible to specify the
				2460	* affected processor models on the command line, hence
				2461	* the need to ignore the workaround.
				2462	*/
				2463	break;
				2464	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
				2465	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
				2466	case HV_X64_MSR_CRASH_CTL:
				2467	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
				2468	return kvm_hv_set_msr_common(vcpu, msr, data,
				2469	msr_info->host_initiated);
				2470	case MSR_IA32_BBL_CR_CTL3:
				2471	/* Drop writes to this legacy MSR -- see rdmsr
				2472	* counterpart for further detail.
				2473	*/
				2474	vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data);
				2475	break;
				2476	case MSR_AMD64_OSVW_ID_LENGTH:
				2477	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2478	return 1;
				2479	vcpu->arch.osvw.length = data;
				2480	break;
				2481	case MSR_AMD64_OSVW_STATUS:
				2482	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2483	return 1;
				2484	vcpu->arch.osvw.status = data;
				2485	break;
				2486	case MSR_PLATFORM_INFO:
				2487	if (!msr_info->host_initiated \|\|
				2488	data & ~MSR_PLATFORM_INFO_CPUID_FAULT \|\|
				2489	(!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
				2490	cpuid_fault_enabled(vcpu)))
				2491	return 1;
				2492	vcpu->arch.msr_platform_info = data;
				2493	break;
				2494	case MSR_MISC_FEATURES_ENABLES:
				2495	if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT \|\|
				2496	(data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
				2497	!supports_cpuid_fault(vcpu)))
				2498	return 1;
				2499	vcpu->arch.msr_misc_features_enables = data;
				2500	break;
				2501	default:
				2502	if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
				2503	return xen_hvm_config(vcpu, data);
				2504	if (kvm_pmu_is_valid_msr(vcpu, msr))
				2505	return kvm_pmu_set_msr(vcpu, msr_info);
				2506	if (!ignore_msrs) {
				2507	vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
				2508	msr, data);
				2509	return 1;
				2510	} else {
				2511	vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
				2512	msr, data);
				2513	break;
				2514	}
				2515	}
				2516	return 0;
				2517	}
				2518	EXPORT_SYMBOL_GPL(kvm_set_msr_common);
				2519
				2520
				2521	/*
				2522	* Reads an msr value (of 'msr_index') into 'pdata'.
				2523	* Returns 0 on success, non-0 otherwise.
				2524	* Assumes vcpu_load() was already called.
				2525	*/
				2526	int kvm_get_msr(struct kvm_vcpu vcpu, struct msr_data msr)
				2527	{
				2528	return kvm_x86_ops->get_msr(vcpu, msr);
				2529	}
				2530	EXPORT_SYMBOL_GPL(kvm_get_msr);
				2531
				2532	static int get_msr_mce(struct kvm_vcpu vcpu, u32 msr, u64 pdata)
				2533	{
				2534	u64 data;
				2535	u64 mcg_cap = vcpu->arch.mcg_cap;
				2536	unsigned bank_num = mcg_cap & 0xff;
				2537
				2538	switch (msr) {
				2539	case MSR_IA32_P5_MC_ADDR:
				2540	case MSR_IA32_P5_MC_TYPE:
				2541	data = 0;
				2542	break;
				2543	case MSR_IA32_MCG_CAP:
				2544	data = vcpu->arch.mcg_cap;
				2545	break;
				2546	case MSR_IA32_MCG_CTL:
				2547	if (!(mcg_cap & MCG_CTL_P))
				2548	return 1;
				2549	data = vcpu->arch.mcg_ctl;
				2550	break;
				2551	case MSR_IA32_MCG_STATUS:
				2552	data = vcpu->arch.mcg_status;
				2553	break;
				2554	default:
				2555	if (msr >= MSR_IA32_MC0_CTL &&
				2556	msr < MSR_IA32_MCx_CTL(bank_num)) {
				2557	u32 offset = array_index_nospec(
				2558	msr - MSR_IA32_MC0_CTL,
				2559	MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
				2560
				2561	data = vcpu->arch.mce_banks[offset];
				2562	break;
				2563	}
				2564	return 1;
				2565	}
				2566	*pdata = data;
				2567	return 0;
				2568	}
				2569
				2570	int kvm_get_msr_common(struct kvm_vcpu vcpu, struct msr_data msr_info)
				2571	{
				2572	switch (msr_info->index) {
				2573	case MSR_IA32_PLATFORM_ID:
				2574	case MSR_IA32_EBL_CR_POWERON:
				2575	case MSR_IA32_DEBUGCTLMSR:
				2576	case MSR_IA32_LASTBRANCHFROMIP:
				2577	case MSR_IA32_LASTBRANCHTOIP:
				2578	case MSR_IA32_LASTINTFROMIP:
				2579	case MSR_IA32_LASTINTTOIP:
				2580	case MSR_K8_SYSCFG:
				2581	case MSR_K8_TSEG_ADDR:
				2582	case MSR_K8_TSEG_MASK:
				2583	case MSR_K7_HWCR:
				2584	case MSR_VM_HSAVE_PA:
				2585	case MSR_K8_INT_PENDING_MSG:
				2586	case MSR_AMD64_NB_CFG:
				2587	case MSR_FAM10H_MMIO_CONF_BASE:
				2588	case MSR_AMD64_BU_CFG2:
				2589	case MSR_IA32_PERF_CTL:
				2590	case MSR_AMD64_DC_CFG:
				2591	case MSR_F15H_EX_CFG:
				2592	msr_info->data = 0;
				2593	break;
				2594	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
				2595	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
				2596	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
				2597	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
				2598	if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
				2599	return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
				2600	msr_info->data = 0;
				2601	break;
				2602	case MSR_IA32_UCODE_REV:
				2603	msr_info->data = vcpu->arch.microcode_version;
				2604	break;
				2605	case MSR_IA32_ARCH_CAPABILITIES:
				2606	if (!msr_info->host_initiated &&
				2607	!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
				2608	return 1;
				2609	msr_info->data = vcpu->arch.arch_capabilities;
				2610	break;
				2611	case MSR_MTRRcap:
				2612	case 0x200 ... 0x2ff:
				2613	return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
				2614	case 0xcd: /* fsb frequency */
				2615	msr_info->data = 3;
				2616	break;
				2617	/*
				2618	* MSR_EBC_FREQUENCY_ID
				2619	* Conservative value valid for even the basic CPU models.
				2620	* Models 0,1: 000 in bits 23:21 indicating a bus speed of
				2621	* 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
				2622	* and 266MHz for model 3, or 4. Set Core Clock
				2623	* Frequency to System Bus Frequency Ratio to 1 (bits
				2624	* 31:24) even though these are only valid for CPU
				2625	* models > 2, however guests may end up dividing or
				2626	* multiplying by zero otherwise.
				2627	*/
				2628	case MSR_EBC_FREQUENCY_ID:
				2629	msr_info->data = 1 << 24;
				2630	break;
				2631	case MSR_IA32_APICBASE:
				2632	msr_info->data = kvm_get_apic_base(vcpu);
				2633	break;
				2634	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
				2635	return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
				2636	break;
				2637	case MSR_IA32_TSCDEADLINE:
				2638	msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
				2639	break;
				2640	case MSR_IA32_TSC_ADJUST:
				2641	msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
				2642	break;
				2643	case MSR_IA32_MISC_ENABLE:
				2644	msr_info->data = vcpu->arch.ia32_misc_enable_msr;
				2645	break;
				2646	case MSR_IA32_SMBASE:
				2647	if (!msr_info->host_initiated)
				2648	return 1;
				2649	msr_info->data = vcpu->arch.smbase;
				2650	break;
				2651	case MSR_IA32_PERF_STATUS:
				2652	/* TSC increment by tick */
				2653	msr_info->data = 1000ULL;
				2654	/* CPU multiplier */
				2655	msr_info->data \|= (((uint64_t)4ULL) << 40);
				2656	break;
				2657	case MSR_EFER:
				2658	msr_info->data = vcpu->arch.efer;
				2659	break;
				2660	case MSR_KVM_WALL_CLOCK:
				2661	case MSR_KVM_WALL_CLOCK_NEW:
				2662	msr_info->data = vcpu->kvm->arch.wall_clock;
				2663	break;
				2664	case MSR_KVM_SYSTEM_TIME:
				2665	case MSR_KVM_SYSTEM_TIME_NEW:
				2666	msr_info->data = vcpu->arch.time;
				2667	break;
				2668	case MSR_KVM_ASYNC_PF_EN:
				2669	msr_info->data = vcpu->arch.apf.msr_val;
				2670	break;
				2671	case MSR_KVM_STEAL_TIME:
				2672	msr_info->data = vcpu->arch.st.msr_val;
				2673	break;
				2674	case MSR_KVM_PV_EOI_EN:
				2675	msr_info->data = vcpu->arch.pv_eoi.msr_val;
				2676	break;
				2677	case MSR_IA32_P5_MC_ADDR:
				2678	case MSR_IA32_P5_MC_TYPE:
				2679	case MSR_IA32_MCG_CAP:
				2680	case MSR_IA32_MCG_CTL:
				2681	case MSR_IA32_MCG_STATUS:
				2682	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
				2683	return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
				2684	case MSR_K7_CLK_CTL:
				2685	/*
				2686	* Provide expected ramp-up count for K7. All other
				2687	* are set to zero, indicating minimum divisors for
				2688	* every field.
				2689	*
				2690	* This prevents guest kernels on AMD host with CPU
				2691	* type 6, model 8 and higher from exploding due to
				2692	* the rdmsr failing.
				2693	*/
				2694	msr_info->data = 0x20000000;
				2695	break;
				2696	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
				2697	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
				2698	case HV_X64_MSR_CRASH_CTL:
				2699	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
				2700	return kvm_hv_get_msr_common(vcpu,
				2701	msr_info->index, &msr_info->data);
				2702	break;
				2703	case MSR_IA32_BBL_CR_CTL3:
				2704	/* This legacy MSR exists but isn't fully documented in current
				2705	* silicon. It is however accessed by winxp in very narrow
				2706	* scenarios where it sets bit #19, itself documented as
				2707	* a "reserved" bit. Best effort attempt to source coherent
				2708	* read data here should the balance of the register be
				2709	* interpreted by the guest:
				2710	*
				2711	* L2 cache control register 3: 64GB range, 256KB size,
				2712	* enabled, latency 0x1, configured
				2713	*/
				2714	msr_info->data = 0xbe702111;
				2715	break;
				2716	case MSR_AMD64_OSVW_ID_LENGTH:
				2717	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2718	return 1;
				2719	msr_info->data = vcpu->arch.osvw.length;
				2720	break;
				2721	case MSR_AMD64_OSVW_STATUS:
				2722	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2723	return 1;
				2724	msr_info->data = vcpu->arch.osvw.status;
				2725	break;
				2726	case MSR_PLATFORM_INFO:
				2727	msr_info->data = vcpu->arch.msr_platform_info;
				2728	break;
				2729	case MSR_MISC_FEATURES_ENABLES:
				2730	msr_info->data = vcpu->arch.msr_misc_features_enables;
				2731	break;
				2732	default:
				2733	if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
				2734	return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
				2735	if (!ignore_msrs) {
				2736	vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
				2737	msr_info->index);
				2738	return 1;
				2739	} else {
				2740	vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
				2741	msr_info->data = 0;
				2742	}
				2743	break;
				2744	}
				2745	return 0;
				2746	}
				2747	EXPORT_SYMBOL_GPL(kvm_get_msr_common);
				2748
				2749	/*
				2750	* Read or write a bunch of msrs. All parameters are kernel addresses.
				2751	*
				2752	* @return number of msrs set successfully.
				2753	*/
				2754	static int __msr_io(struct kvm_vcpu vcpu, struct kvm_msrs msrs,
				2755	struct kvm_msr_entry *entries,
				2756	int (do_msr)(struct kvm_vcpu vcpu,
				2757	unsigned index, u64 *data))
				2758	{
				2759	int i;
				2760
				2761	for (i = 0; i < msrs->nmsrs; ++i)
				2762	if (do_msr(vcpu, entries[i].index, &entries[i].data))
				2763	break;
				2764
				2765	return i;
				2766	}
				2767
				2768	/*
				2769	* Read or write a bunch of msrs. Parameters are user addresses.
				2770	*
				2771	* @return number of msrs set successfully.
				2772	*/
				2773	static int msr_io(struct kvm_vcpu vcpu, struct kvm_msrs __user user_msrs,
				2774	int (do_msr)(struct kvm_vcpu vcpu,
				2775	unsigned index, u64 *data),
				2776	int writeback)
				2777	{
				2778	struct kvm_msrs msrs;
				2779	struct kvm_msr_entry *entries;
				2780	int r, n;
				2781	unsigned size;
				2782
				2783	r = -EFAULT;
				2784	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
				2785	goto out;
				2786
				2787	r = -E2BIG;
				2788	if (msrs.nmsrs >= MAX_IO_MSRS)
				2789	goto out;
				2790
				2791	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
				2792	entries = memdup_user(user_msrs->entries, size);
				2793	if (IS_ERR(entries)) {
				2794	r = PTR_ERR(entries);
				2795	goto out;
				2796	}
				2797
				2798	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
				2799	if (r < 0)
				2800	goto out_free;
				2801
				2802	r = -EFAULT;
				2803	if (writeback && copy_to_user(user_msrs->entries, entries, size))
				2804	goto out_free;
				2805
				2806	r = n;
				2807
				2808	out_free:
				2809	kfree(entries);
				2810	out:
				2811	return r;
				2812	}
				2813
				2814	int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
				2815	{
				2816	int r;
				2817
				2818	switch (ext) {
				2819	case KVM_CAP_IRQCHIP:
				2820	case KVM_CAP_HLT:
				2821	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
				2822	case KVM_CAP_SET_TSS_ADDR:
				2823	case KVM_CAP_EXT_CPUID:
				2824	case KVM_CAP_EXT_EMUL_CPUID:
				2825	case KVM_CAP_CLOCKSOURCE:
				2826	case KVM_CAP_PIT:
				2827	case KVM_CAP_NOP_IO_DELAY:
				2828	case KVM_CAP_MP_STATE:
				2829	case KVM_CAP_SYNC_MMU:
				2830	case KVM_CAP_USER_NMI:
				2831	case KVM_CAP_REINJECT_CONTROL:
				2832	case KVM_CAP_IRQ_INJECT_STATUS:
				2833	case KVM_CAP_IOEVENTFD:
				2834	case KVM_CAP_IOEVENTFD_NO_LENGTH:
				2835	case KVM_CAP_PIT2:
				2836	case KVM_CAP_PIT_STATE2:
				2837	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
				2838	case KVM_CAP_XEN_HVM:
				2839	case KVM_CAP_VCPU_EVENTS:
				2840	case KVM_CAP_HYPERV:
				2841	case KVM_CAP_HYPERV_VAPIC:
				2842	case KVM_CAP_HYPERV_SPIN:
				2843	case KVM_CAP_HYPERV_SYNIC:
				2844	case KVM_CAP_HYPERV_SYNIC2:
				2845	case KVM_CAP_HYPERV_VP_INDEX:
				2846	case KVM_CAP_PCI_SEGMENT:
				2847	case KVM_CAP_DEBUGREGS:
				2848	case KVM_CAP_X86_ROBUST_SINGLESTEP:
				2849	case KVM_CAP_XSAVE:
				2850	case KVM_CAP_ASYNC_PF:
				2851	case KVM_CAP_GET_TSC_KHZ:
				2852	case KVM_CAP_KVMCLOCK_CTRL:
				2853	case KVM_CAP_READONLY_MEM:
				2854	case KVM_CAP_HYPERV_TIME:
				2855	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
				2856	case KVM_CAP_TSC_DEADLINE_TIMER:
				2857	case KVM_CAP_ENABLE_CAP_VM:
				2858	case KVM_CAP_DISABLE_QUIRKS:
				2859	case KVM_CAP_SET_BOOT_CPU_ID:
				2860	case KVM_CAP_SPLIT_IRQCHIP:
				2861	case KVM_CAP_IMMEDIATE_EXIT:
				2862	case KVM_CAP_GET_MSR_FEATURES:
				2863	r = 1;
				2864	break;
				2865	case KVM_CAP_ADJUST_CLOCK:
				2866	r = KVM_CLOCK_TSC_STABLE;
				2867	break;
				2868	case KVM_CAP_X86_GUEST_MWAIT:
				2869	r = kvm_mwait_in_guest();
				2870	break;
				2871	case KVM_CAP_X86_SMM:
				2872	/* SMBASE is usually relocated above 1M on modern chipsets,
				2873	* and SMM handlers might indeed rely on 4G segment limits,
				2874	* so do not report SMM to be available if real mode is
				2875	* emulated via vm86 mode. Still, do not go to great lengths
				2876	* to avoid userspace's usage of the feature, because it is a
				2877	* fringe case that is not enabled except via specific settings
				2878	* of the module parameters.
				2879	*/
				2880	r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
				2881	break;
				2882	case KVM_CAP_VAPIC:
				2883	r = !kvm_x86_ops->cpu_has_accelerated_tpr();
				2884	break;
				2885	case KVM_CAP_NR_VCPUS:
				2886	r = KVM_SOFT_MAX_VCPUS;
				2887	break;
				2888	case KVM_CAP_MAX_VCPUS:
				2889	r = KVM_MAX_VCPUS;
				2890	break;
				2891	case KVM_CAP_MAX_VCPU_ID:
				2892	r = KVM_MAX_VCPU_ID;
				2893	break;
				2894	case KVM_CAP_NR_MEMSLOTS:
				2895	r = KVM_USER_MEM_SLOTS;
				2896	break;
				2897	case KVM_CAP_PV_MMU: /* obsolete */
				2898	r = 0;
				2899	break;
				2900	case KVM_CAP_MCE:
				2901	r = KVM_MAX_MCE_BANKS;
				2902	break;
				2903	case KVM_CAP_XCRS:
				2904	r = boot_cpu_has(X86_FEATURE_XSAVE);
				2905	break;
				2906	case KVM_CAP_TSC_CONTROL:
				2907	r = kvm_has_tsc_control;
				2908	break;
				2909	case KVM_CAP_X2APIC_API:
				2910	r = KVM_X2APIC_API_VALID_FLAGS;
				2911	break;
				2912	default:
				2913	r = 0;
				2914	break;
				2915	}
				2916	return r;
				2917
				2918	}
				2919
				2920	long kvm_arch_dev_ioctl(struct file *filp,
				2921	unsigned int ioctl, unsigned long arg)
				2922	{
				2923	void __user argp = (void __user )arg;
				2924	long r;
				2925
				2926	switch (ioctl) {
				2927	case KVM_GET_MSR_INDEX_LIST: {
				2928	struct kvm_msr_list __user *user_msr_list = argp;
				2929	struct kvm_msr_list msr_list;
				2930	unsigned n;
				2931
				2932	r = -EFAULT;
				2933	if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
				2934	goto out;
				2935	n = msr_list.nmsrs;
				2936	msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
				2937	if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
				2938	goto out;
				2939	r = -E2BIG;
				2940	if (n < msr_list.nmsrs)
				2941	goto out;
				2942	r = -EFAULT;
				2943	if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				2944	num_msrs_to_save * sizeof(u32)))
				2945	goto out;
				2946	if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				2947	&emulated_msrs,
				2948	num_emulated_msrs * sizeof(u32)))
				2949	goto out;
				2950	r = 0;
				2951	break;
				2952	}
				2953	case KVM_GET_SUPPORTED_CPUID:
				2954	case KVM_GET_EMULATED_CPUID: {
				2955	struct kvm_cpuid2 __user *cpuid_arg = argp;
				2956	struct kvm_cpuid2 cpuid;
				2957
				2958	r = -EFAULT;
				2959	if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
				2960	goto out;
				2961
				2962	r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
				2963	ioctl);
				2964	if (r)
				2965	goto out;
				2966
				2967	r = -EFAULT;
				2968	if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
				2969	goto out;
				2970	r = 0;
				2971	break;
				2972	}
				2973	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
				2974	r = -EFAULT;
				2975	if (copy_to_user(argp, &kvm_mce_cap_supported,
				2976	sizeof(kvm_mce_cap_supported)))
				2977	goto out;
				2978	r = 0;
				2979	break;
				2980	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
				2981	struct kvm_msr_list __user *user_msr_list = argp;
				2982	struct kvm_msr_list msr_list;
				2983	unsigned int n;
				2984
				2985	r = -EFAULT;
				2986	if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
				2987	goto out;
				2988	n = msr_list.nmsrs;
				2989	msr_list.nmsrs = num_msr_based_features;
				2990	if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
				2991	goto out;
				2992	r = -E2BIG;
				2993	if (n < msr_list.nmsrs)
				2994	goto out;
				2995	r = -EFAULT;
				2996	if (copy_to_user(user_msr_list->indices, &msr_based_features,
				2997	num_msr_based_features * sizeof(u32)))
				2998	goto out;
				2999	r = 0;
				3000	break;
				3001	}
				3002	case KVM_GET_MSRS:
				3003	r = msr_io(NULL, argp, do_get_msr_feature, 1);
				3004	break;
				3005	}
				3006	default:
				3007	r = -EINVAL;
				3008	}
				3009	out:
				3010	return r;
				3011	}
				3012
				3013	static void wbinvd_ipi(void *garbage)
				3014	{
				3015	wbinvd();
				3016	}
				3017
				3018	static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
				3019	{
				3020	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
				3021	}
				3022
				3023	void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
				3024	{
				3025	/* Address WBINVD may be executed by guest */
				3026	if (need_emulate_wbinvd(vcpu)) {
				3027	if (kvm_x86_ops->has_wbinvd_exit())
				3028	cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
				3029	else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
				3030	smp_call_function_single(vcpu->cpu,
				3031	wbinvd_ipi, NULL, 1);
				3032	}
				3033
				3034	kvm_x86_ops->vcpu_load(vcpu, cpu);
				3035
				3036	/* Apply any externally detected TSC adjustments (due to suspend) */
				3037	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
				3038	adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
				3039	vcpu->arch.tsc_offset_adjustment = 0;
				3040	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				3041	}
				3042
				3043	if (unlikely(vcpu->cpu != cpu) \|\| check_tsc_unstable()) {
				3044	s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
				3045	rdtsc() - vcpu->arch.last_host_tsc;
				3046	if (tsc_delta < 0)
				3047	mark_tsc_unstable("KVM discovered backwards TSC");
				3048
				3049	if (check_tsc_unstable()) {
				3050	u64 offset = kvm_compute_tsc_offset(vcpu,
				3051	vcpu->arch.last_guest_tsc);
				3052	kvm_vcpu_write_tsc_offset(vcpu, offset);
				3053	vcpu->arch.tsc_catchup = 1;
				3054	}
				3055
				3056	if (kvm_lapic_hv_timer_in_use(vcpu))
				3057	kvm_lapic_restart_hv_timer(vcpu);
				3058
				3059	/*
				3060	* On a host with synchronized TSC, there is no need to update
				3061	* kvmclock on vcpu->cpu migration
				3062	*/
				3063	if (!vcpu->kvm->arch.use_master_clock \|\| vcpu->cpu == -1)
				3064	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
				3065	if (vcpu->cpu != cpu)
				3066	kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
				3067	vcpu->cpu = cpu;
				3068	}
				3069
				3070	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
				3071	}
				3072
				3073	static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
				3074	{
				3075	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
				3076	return;
				3077
				3078	if (vcpu->arch.st.steal.preempted)
				3079	return;
				3080
				3081	vcpu->arch.st.steal.preempted = 1;
				3082
				3083	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
				3084	&vcpu->arch.st.steal.preempted,
				3085	offsetof(struct kvm_steal_time, preempted),
				3086	sizeof(vcpu->arch.st.steal.preempted));
				3087	}
				3088
				3089	void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
				3090	{
				3091	int idx;
				3092
				3093	if (vcpu->preempted)
				3094	vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
				3095
				3096	/*
				3097	* Disable page faults because we're in atomic context here.
				3098	* kvm_write_guest_offset_cached() would call might_fault()
				3099	* that relies on pagefault_disable() to tell if there's a
				3100	* bug. NOTE: the write to guest memory may not go through if
				3101	* during postcopy live migration or if there's heavy guest
				3102	* paging.
				3103	*/
				3104	pagefault_disable();
				3105	/*
				3106	* kvm_memslots() will be called by
				3107	* kvm_write_guest_offset_cached() so take the srcu lock.
				3108	*/
				3109	idx = srcu_read_lock(&vcpu->kvm->srcu);
				3110	kvm_steal_time_set_preempted(vcpu);
				3111	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				3112	pagefault_enable();
				3113	kvm_x86_ops->vcpu_put(vcpu);
				3114	vcpu->arch.last_host_tsc = rdtsc();
				3115	/*
				3116	* If userspace has set any breakpoints or watchpoints, dr6 is restored
				3117	* on every vmexit, but if not, we might have a stale dr6 from the
				3118	* guest. do_debug expects dr6 to be cleared after it runs, do the same.
				3119	*/
				3120	set_debugreg(0, 6);
				3121	}
				3122
				3123	static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
				3124	struct kvm_lapic_state *s)
				3125	{
				3126	if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
				3127	kvm_x86_ops->sync_pir_to_irr(vcpu);
				3128
				3129	return kvm_apic_get_state(vcpu, s);
				3130	}
				3131
				3132	static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
				3133	struct kvm_lapic_state *s)
				3134	{
				3135	int r;
				3136
				3137	r = kvm_apic_set_state(vcpu, s);
				3138	if (r)
				3139	return r;
				3140	update_cr8_intercept(vcpu);
				3141
				3142	return 0;
				3143	}
				3144
				3145	static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
				3146	{
				3147	return (!lapic_in_kernel(vcpu) \|\|
				3148	kvm_apic_accept_pic_intr(vcpu));
				3149	}
				3150
				3151	/*
				3152	* if userspace requested an interrupt window, check that the
				3153	* interrupt window is open.
				3154	*
				3155	* No need to exit to userspace if we already have an interrupt queued.
				3156	*/
				3157	static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
				3158	{
				3159	return kvm_arch_interrupt_allowed(vcpu) &&
				3160	!kvm_cpu_has_interrupt(vcpu) &&
				3161	!kvm_event_needs_reinjection(vcpu) &&
				3162	kvm_cpu_accept_dm_intr(vcpu);
				3163	}
				3164
				3165	static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
				3166	struct kvm_interrupt *irq)
				3167	{
				3168	if (irq->irq >= KVM_NR_INTERRUPTS)
				3169	return -EINVAL;
				3170
				3171	if (!irqchip_in_kernel(vcpu->kvm)) {
				3172	kvm_queue_interrupt(vcpu, irq->irq, false);
				3173	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3174	return 0;
				3175	}
				3176
				3177	/*
				3178	* With in-kernel LAPIC, we only use this to inject EXTINT, so
				3179	* fail for in-kernel 8259.
				3180	*/
				3181	if (pic_in_kernel(vcpu->kvm))
				3182	return -ENXIO;
				3183
				3184	if (vcpu->arch.pending_external_vector != -1)
				3185	return -EEXIST;
				3186
				3187	vcpu->arch.pending_external_vector = irq->irq;
				3188	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3189	return 0;
				3190	}
				3191
				3192	static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
				3193	{
				3194	kvm_inject_nmi(vcpu);
				3195
				3196	return 0;
				3197	}
				3198
				3199	static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
				3200	{
				3201	kvm_make_request(KVM_REQ_SMI, vcpu);
				3202
				3203	return 0;
				3204	}
				3205
				3206	static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
				3207	struct kvm_tpr_access_ctl *tac)
				3208	{
				3209	if (tac->flags)
				3210	return -EINVAL;
				3211	vcpu->arch.tpr_access_reporting = !!tac->enabled;
				3212	return 0;
				3213	}
				3214
				3215	static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
				3216	u64 mcg_cap)
				3217	{
				3218	int r;
				3219	unsigned bank_num = mcg_cap & 0xff, bank;
				3220
				3221	r = -EINVAL;
				3222	if (!bank_num \|\| bank_num > KVM_MAX_MCE_BANKS)
				3223	goto out;
				3224	if (mcg_cap & ~(kvm_mce_cap_supported \| 0xff \| 0xff0000))
				3225	goto out;
				3226	r = 0;
				3227	vcpu->arch.mcg_cap = mcg_cap;
				3228	/* Init IA32_MCG_CTL to all 1s */
				3229	if (mcg_cap & MCG_CTL_P)
				3230	vcpu->arch.mcg_ctl = ~(u64)0;
				3231	/* Init IA32_MCi_CTL to all 1s */
				3232	for (bank = 0; bank < bank_num; bank++)
				3233	vcpu->arch.mce_banks[bank*4] = ~(u64)0;
				3234
				3235	if (kvm_x86_ops->setup_mce)
				3236	kvm_x86_ops->setup_mce(vcpu);
				3237	out:
				3238	return r;
				3239	}
				3240
				3241	static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
				3242	struct kvm_x86_mce *mce)
				3243	{
				3244	u64 mcg_cap = vcpu->arch.mcg_cap;
				3245	unsigned bank_num = mcg_cap & 0xff;
				3246	u64 *banks = vcpu->arch.mce_banks;
				3247
				3248	if (mce->bank >= bank_num \|\| !(mce->status & MCI_STATUS_VAL))
				3249	return -EINVAL;
				3250	/*
				3251	* if IA32_MCG_CTL is not all 1s, the uncorrected error
				3252	* reporting is disabled
				3253	*/
				3254	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
				3255	vcpu->arch.mcg_ctl != ~(u64)0)
				3256	return 0;
				3257	banks += 4 * mce->bank;
				3258	/*
				3259	* if IA32_MCi_CTL is not all 1s, the uncorrected error
				3260	* reporting is disabled for the bank
				3261	*/
				3262	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
				3263	return 0;
				3264	if (mce->status & MCI_STATUS_UC) {
				3265	if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) \|\|
				3266	!kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
				3267	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				3268	return 0;
				3269	}
				3270	if (banks[1] & MCI_STATUS_VAL)
				3271	mce->status \|= MCI_STATUS_OVER;
				3272	banks[2] = mce->addr;
				3273	banks[3] = mce->misc;
				3274	vcpu->arch.mcg_status = mce->mcg_status;
				3275	banks[1] = mce->status;
				3276	kvm_queue_exception(vcpu, MC_VECTOR);
				3277	} else if (!(banks[1] & MCI_STATUS_VAL)
				3278	\|\| !(banks[1] & MCI_STATUS_UC)) {
				3279	if (banks[1] & MCI_STATUS_VAL)
				3280	mce->status \|= MCI_STATUS_OVER;
				3281	banks[2] = mce->addr;
				3282	banks[3] = mce->misc;
				3283	banks[1] = mce->status;
				3284	} else
				3285	banks[1] \|= MCI_STATUS_OVER;
				3286	return 0;
				3287	}
				3288
				3289	static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
				3290	struct kvm_vcpu_events *events)
				3291	{
				3292	process_nmi(vcpu);
				3293	/*
				3294	* FIXME: pass injected and pending separately. This is only
				3295	* needed for nested virtualization, whose state cannot be
				3296	* migrated yet. For now we can combine them.
				3297	*/
				3298	events->exception.injected =
				3299	(vcpu->arch.exception.pending \|\|
				3300	vcpu->arch.exception.injected) &&
				3301	!kvm_exception_is_soft(vcpu->arch.exception.nr);
				3302	events->exception.nr = vcpu->arch.exception.nr;
				3303	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
				3304	events->exception.pad = 0;
				3305	events->exception.error_code = vcpu->arch.exception.error_code;
				3306
				3307	events->interrupt.injected =
				3308	vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
				3309	events->interrupt.nr = vcpu->arch.interrupt.nr;
				3310	events->interrupt.soft = 0;
				3311	events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
				3312
				3313	events->nmi.injected = vcpu->arch.nmi_injected;
				3314	events->nmi.pending = vcpu->arch.nmi_pending != 0;
				3315	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
				3316	events->nmi.pad = 0;
				3317
				3318	events->sipi_vector = 0; /* never valid when reporting to user space */
				3319
				3320	events->smi.smm = is_smm(vcpu);
				3321	events->smi.pending = vcpu->arch.smi_pending;
				3322	events->smi.smm_inside_nmi =
				3323	!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
				3324	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
				3325
				3326	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
				3327	\| KVM_VCPUEVENT_VALID_SHADOW
				3328	\| KVM_VCPUEVENT_VALID_SMM);
				3329	memset(&events->reserved, 0, sizeof(events->reserved));
				3330	}
				3331
				3332	static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
				3333
				3334	static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
				3335	struct kvm_vcpu_events *events)
				3336	{
				3337	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
				3338	\| KVM_VCPUEVENT_VALID_SIPI_VECTOR
				3339	\| KVM_VCPUEVENT_VALID_SHADOW
				3340	\| KVM_VCPUEVENT_VALID_SMM))
				3341	return -EINVAL;
				3342
				3343	if (events->exception.injected &&
				3344	(events->exception.nr > 31 \|\| events->exception.nr == NMI_VECTOR \|\|
				3345	is_guest_mode(vcpu)))
				3346	return -EINVAL;
				3347
				3348	/* INITs are latched while in SMM */
				3349	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
				3350	(events->smi.smm \|\| events->smi.pending) &&
				3351	vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
				3352	return -EINVAL;
				3353
				3354	process_nmi(vcpu);
				3355	vcpu->arch.exception.injected = false;
				3356	vcpu->arch.exception.pending = events->exception.injected;
				3357	vcpu->arch.exception.nr = events->exception.nr;
				3358	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
				3359	vcpu->arch.exception.error_code = events->exception.error_code;
				3360
				3361	vcpu->arch.interrupt.pending = events->interrupt.injected;
				3362	vcpu->arch.interrupt.nr = events->interrupt.nr;
				3363	vcpu->arch.interrupt.soft = events->interrupt.soft;
				3364	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
				3365	kvm_x86_ops->set_interrupt_shadow(vcpu,
				3366	events->interrupt.shadow);
				3367
				3368	vcpu->arch.nmi_injected = events->nmi.injected;
				3369	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
				3370	vcpu->arch.nmi_pending = events->nmi.pending;
				3371	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
				3372
				3373	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
				3374	lapic_in_kernel(vcpu))
				3375	vcpu->arch.apic->sipi_vector = events->sipi_vector;
				3376
				3377	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
				3378	u32 hflags = vcpu->arch.hflags;
				3379	if (events->smi.smm)
				3380	hflags \|= HF_SMM_MASK;
				3381	else
				3382	hflags &= ~HF_SMM_MASK;
				3383	kvm_set_hflags(vcpu, hflags);
				3384
				3385	vcpu->arch.smi_pending = events->smi.pending;
				3386
				3387	if (events->smi.smm) {
				3388	if (events->smi.smm_inside_nmi)
				3389	vcpu->arch.hflags \|= HF_SMM_INSIDE_NMI_MASK;
				3390	else
				3391	vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
				3392	if (lapic_in_kernel(vcpu)) {
				3393	if (events->smi.latched_init)
				3394	set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
				3395	else
				3396	clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
				3397	}
				3398	}
				3399	}
				3400
				3401	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3402
				3403	return 0;
				3404	}
				3405
				3406	static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
				3407	struct kvm_debugregs *dbgregs)
				3408	{
				3409	unsigned long val;
				3410
				3411	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
				3412	kvm_get_dr(vcpu, 6, &val);
				3413	dbgregs->dr6 = val;
				3414	dbgregs->dr7 = vcpu->arch.dr7;
				3415	dbgregs->flags = 0;
				3416	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
				3417	}
				3418
				3419	static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
				3420	struct kvm_debugregs *dbgregs)
				3421	{
				3422	if (dbgregs->flags)
				3423	return -EINVAL;
				3424
				3425	if (dbgregs->dr6 & ~0xffffffffull)
				3426	return -EINVAL;
				3427	if (dbgregs->dr7 & ~0xffffffffull)
				3428	return -EINVAL;
				3429
				3430	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
				3431	kvm_update_dr0123(vcpu);
				3432	vcpu->arch.dr6 = dbgregs->dr6;
				3433	kvm_update_dr6(vcpu);
				3434	vcpu->arch.dr7 = dbgregs->dr7;
				3435	kvm_update_dr7(vcpu);
				3436
				3437	return 0;
				3438	}
				3439
				3440	#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
				3441
				3442	static void fill_xsave(u8 dest, struct kvm_vcpu vcpu)
				3443	{
				3444	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
				3445	u64 xstate_bv = xsave->header.xfeatures;
				3446	u64 valid;
				3447
				3448	/*
				3449	* Copy legacy XSAVE area, to avoid complications with CPUID
				3450	* leaves 0 and 1 in the loop below.
				3451	*/
				3452	memcpy(dest, xsave, XSAVE_HDR_OFFSET);
				3453
				3454	/* Set XSTATE_BV */
				3455	xstate_bv &= vcpu->arch.guest_supported_xcr0 \| XFEATURE_MASK_FPSSE;
				3456	(u64 )(dest + XSAVE_HDR_OFFSET) = xstate_bv;
				3457
				3458	/*
				3459	* Copy each region from the possibly compacted offset to the
				3460	* non-compacted offset.
				3461	*/
				3462	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
				3463	while (valid) {
				3464	u64 feature = valid & -valid;
				3465	int index = fls64(feature) - 1;
				3466	void *src = get_xsave_addr(xsave, feature);
				3467
				3468	if (src) {
				3469	u32 size, offset, ecx, edx;
				3470	cpuid_count(XSTATE_CPUID, index,
				3471	&size, &offset, &ecx, &edx);
				3472	if (feature == XFEATURE_MASK_PKRU)
				3473	memcpy(dest + offset, &vcpu->arch.pkru,
				3474	sizeof(vcpu->arch.pkru));
				3475	else
				3476	memcpy(dest + offset, src, size);
				3477
				3478	}
				3479
				3480	valid -= feature;
				3481	}
				3482	}
				3483
				3484	static void load_xsave(struct kvm_vcpu vcpu, u8 src)
				3485	{
				3486	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
				3487	u64 xstate_bv = (u64 )(src + XSAVE_HDR_OFFSET);
				3488	u64 valid;
				3489
				3490	/*
				3491	* Copy legacy XSAVE area, to avoid complications with CPUID
				3492	* leaves 0 and 1 in the loop below.
				3493	*/
				3494	memcpy(xsave, src, XSAVE_HDR_OFFSET);
				3495
				3496	/* Set XSTATE_BV and possibly XCOMP_BV. */
				3497	xsave->header.xfeatures = xstate_bv;
				3498	if (boot_cpu_has(X86_FEATURE_XSAVES))
				3499	xsave->header.xcomp_bv = host_xcr0 \| XSTATE_COMPACTION_ENABLED;
				3500
				3501	/*
				3502	* Copy each region from the non-compacted offset to the
				3503	* possibly compacted offset.
				3504	*/
				3505	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
				3506	while (valid) {
				3507	u64 feature = valid & -valid;
				3508	int index = fls64(feature) - 1;
				3509	void *dest = get_xsave_addr(xsave, feature);
				3510
				3511	if (dest) {
				3512	u32 size, offset, ecx, edx;
				3513	cpuid_count(XSTATE_CPUID, index,
				3514	&size, &offset, &ecx, &edx);
				3515	if (feature == XFEATURE_MASK_PKRU)
				3516	memcpy(&vcpu->arch.pkru, src + offset,
				3517	sizeof(vcpu->arch.pkru));
				3518	else
				3519	memcpy(dest, src + offset, size);
				3520	}
				3521
				3522	valid -= feature;
				3523	}
				3524	}
				3525
				3526	static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
				3527	struct kvm_xsave *guest_xsave)
				3528	{
				3529	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
				3530	memset(guest_xsave, 0, sizeof(struct kvm_xsave));
				3531	fill_xsave((u8 *) guest_xsave->region, vcpu);
				3532	} else {
				3533	memcpy(guest_xsave->region,
				3534	&vcpu->arch.guest_fpu.state.fxsave,
				3535	sizeof(struct fxregs_state));
				3536	(u64 )&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
				3537	XFEATURE_MASK_FPSSE;
				3538	}
				3539	}
				3540
				3541	#define XSAVE_MXCSR_OFFSET 24
				3542
				3543	static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
				3544	struct kvm_xsave *guest_xsave)
				3545	{
				3546	u64 xstate_bv =
				3547	(u64 )&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
				3548	u32 mxcsr = (u32 )&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
				3549
				3550	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
				3551	/*
				3552	* Here we allow setting states that are not present in
				3553	* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
				3554	* with old userspace.
				3555	*/
				3556	if (xstate_bv & ~kvm_supported_xcr0() \|\|
				3557	mxcsr & ~mxcsr_feature_mask)
				3558	return -EINVAL;
				3559	load_xsave(vcpu, (u8 *)guest_xsave->region);
				3560	} else {
				3561	if (xstate_bv & ~XFEATURE_MASK_FPSSE \|\|
				3562	mxcsr & ~mxcsr_feature_mask)
				3563	return -EINVAL;
				3564	memcpy(&vcpu->arch.guest_fpu.state.fxsave,
				3565	guest_xsave->region, sizeof(struct fxregs_state));
				3566	}
				3567	return 0;
				3568	}
				3569
				3570	static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
				3571	struct kvm_xcrs *guest_xcrs)
				3572	{
				3573	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
				3574	guest_xcrs->nr_xcrs = 0;
				3575	return;
				3576	}
				3577
				3578	guest_xcrs->nr_xcrs = 1;
				3579	guest_xcrs->flags = 0;
				3580	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
				3581	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
				3582	}
				3583
				3584	static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
				3585	struct kvm_xcrs *guest_xcrs)
				3586	{
				3587	int i, r = 0;
				3588
				3589	if (!boot_cpu_has(X86_FEATURE_XSAVE))
				3590	return -EINVAL;
				3591
				3592	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS \|\| guest_xcrs->flags)
				3593	return -EINVAL;
				3594
				3595	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
				3596	/* Only support XCR0 currently */
				3597	if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
				3598	r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
				3599	guest_xcrs->xcrs[i].value);
				3600	break;
				3601	}
				3602	if (r)
				3603	r = -EINVAL;
				3604	return r;
				3605	}
				3606
				3607	/*
				3608	* kvm_set_guest_paused() indicates to the guest kernel that it has been
				3609	* stopped by the hypervisor. This function will be called from the host only.
				3610	* EINVAL is returned when the host attempts to set the flag for a guest that
				3611	* does not support pv clocks.
				3612	*/
				3613	static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
				3614	{
				3615	if (!vcpu->arch.pv_time_enabled)
				3616	return -EINVAL;
				3617	vcpu->arch.pvclock_set_guest_stopped_request = true;
				3618	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				3619	return 0;
				3620	}
				3621
				3622	static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
				3623	struct kvm_enable_cap *cap)
				3624	{
				3625	if (cap->flags)
				3626	return -EINVAL;
				3627
				3628	switch (cap->cap) {
				3629	case KVM_CAP_HYPERV_SYNIC2:
				3630	if (cap->args[0])
				3631	return -EINVAL;
				3632	case KVM_CAP_HYPERV_SYNIC:
				3633	if (!irqchip_in_kernel(vcpu->kvm))
				3634	return -EINVAL;
				3635	return kvm_hv_activate_synic(vcpu, cap->cap ==
				3636	KVM_CAP_HYPERV_SYNIC2);
				3637	default:
				3638	return -EINVAL;
				3639	}
				3640	}
				3641
				3642	long kvm_arch_vcpu_ioctl(struct file *filp,
				3643	unsigned int ioctl, unsigned long arg)
				3644	{
				3645	struct kvm_vcpu *vcpu = filp->private_data;
				3646	void __user argp = (void __user )arg;
				3647	int r;
				3648	union {
				3649	struct kvm_lapic_state *lapic;
				3650	struct kvm_xsave *xsave;
				3651	struct kvm_xcrs *xcrs;
				3652	void *buffer;
				3653	} u;
				3654
				3655	u.buffer = NULL;
				3656	switch (ioctl) {
				3657	case KVM_GET_LAPIC: {
				3658	r = -EINVAL;
				3659	if (!lapic_in_kernel(vcpu))
				3660	goto out;
				3661	u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
				3662
				3663	r = -ENOMEM;
				3664	if (!u.lapic)
				3665	goto out;
				3666	r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
				3667	if (r)
				3668	goto out;
				3669	r = -EFAULT;
				3670	if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
				3671	goto out;
				3672	r = 0;
				3673	break;
				3674	}
				3675	case KVM_SET_LAPIC: {
				3676	r = -EINVAL;
				3677	if (!lapic_in_kernel(vcpu))
				3678	goto out;
				3679	u.lapic = memdup_user(argp, sizeof(*u.lapic));
				3680	if (IS_ERR(u.lapic))
				3681	return PTR_ERR(u.lapic);
				3682
				3683	r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
				3684	break;
				3685	}
				3686	case KVM_INTERRUPT: {
				3687	struct kvm_interrupt irq;
				3688
				3689	r = -EFAULT;
				3690	if (copy_from_user(&irq, argp, sizeof irq))
				3691	goto out;
				3692	r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
				3693	break;
				3694	}
				3695	case KVM_NMI: {
				3696	r = kvm_vcpu_ioctl_nmi(vcpu);
				3697	break;
				3698	}
				3699	case KVM_SMI: {
				3700	r = kvm_vcpu_ioctl_smi(vcpu);
				3701	break;
				3702	}
				3703	case KVM_SET_CPUID: {
				3704	struct kvm_cpuid __user *cpuid_arg = argp;
				3705	struct kvm_cpuid cpuid;
				3706
				3707	r = -EFAULT;
				3708	if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
				3709	goto out;
				3710	r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
				3711	break;
				3712	}
				3713	case KVM_SET_CPUID2: {
				3714	struct kvm_cpuid2 __user *cpuid_arg = argp;
				3715	struct kvm_cpuid2 cpuid;
				3716
				3717	r = -EFAULT;
				3718	if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
				3719	goto out;
				3720	r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
				3721	cpuid_arg->entries);
				3722	break;
				3723	}
				3724	case KVM_GET_CPUID2: {
				3725	struct kvm_cpuid2 __user *cpuid_arg = argp;
				3726	struct kvm_cpuid2 cpuid;
				3727
				3728	r = -EFAULT;
				3729	if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
				3730	goto out;
				3731	r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
				3732	cpuid_arg->entries);
				3733	if (r)
				3734	goto out;
				3735	r = -EFAULT;
				3736	if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
				3737	goto out;
				3738	r = 0;
				3739	break;
				3740	}
				3741	case KVM_GET_MSRS: {
				3742	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				3743	r = msr_io(vcpu, argp, do_get_msr, 1);
				3744	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				3745	break;
				3746	}
				3747	case KVM_SET_MSRS: {
				3748	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				3749	r = msr_io(vcpu, argp, do_set_msr, 0);
				3750	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				3751	break;
				3752	}
				3753	case KVM_TPR_ACCESS_REPORTING: {
				3754	struct kvm_tpr_access_ctl tac;
				3755
				3756	r = -EFAULT;
				3757	if (copy_from_user(&tac, argp, sizeof tac))
				3758	goto out;
				3759	r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
				3760	if (r)
				3761	goto out;
				3762	r = -EFAULT;
				3763	if (copy_to_user(argp, &tac, sizeof tac))
				3764	goto out;
				3765	r = 0;
				3766	break;
				3767	};
				3768	case KVM_SET_VAPIC_ADDR: {
				3769	struct kvm_vapic_addr va;
				3770	int idx;
				3771
				3772	r = -EINVAL;
				3773	if (!lapic_in_kernel(vcpu))
				3774	goto out;
				3775	r = -EFAULT;
				3776	if (copy_from_user(&va, argp, sizeof va))
				3777	goto out;
				3778	idx = srcu_read_lock(&vcpu->kvm->srcu);
				3779	r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
				3780	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				3781	break;
				3782	}
				3783	case KVM_X86_SETUP_MCE: {
				3784	u64 mcg_cap;
				3785
				3786	r = -EFAULT;
				3787	if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
				3788	goto out;
				3789	r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
				3790	break;
				3791	}
				3792	case KVM_X86_SET_MCE: {
				3793	struct kvm_x86_mce mce;
				3794
				3795	r = -EFAULT;
				3796	if (copy_from_user(&mce, argp, sizeof mce))
				3797	goto out;
				3798	r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
				3799	break;
				3800	}
				3801	case KVM_GET_VCPU_EVENTS: {
				3802	struct kvm_vcpu_events events;
				3803
				3804	kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
				3805
				3806	r = -EFAULT;
				3807	if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
				3808	break;
				3809	r = 0;
				3810	break;
				3811	}
				3812	case KVM_SET_VCPU_EVENTS: {
				3813	struct kvm_vcpu_events events;
				3814
				3815	r = -EFAULT;
				3816	if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
				3817	break;
				3818
				3819	r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
				3820	break;
				3821	}
				3822	case KVM_GET_DEBUGREGS: {
				3823	struct kvm_debugregs dbgregs;
				3824
				3825	kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
				3826
				3827	r = -EFAULT;
				3828	if (copy_to_user(argp, &dbgregs,
				3829	sizeof(struct kvm_debugregs)))
				3830	break;
				3831	r = 0;
				3832	break;
				3833	}
				3834	case KVM_SET_DEBUGREGS: {
				3835	struct kvm_debugregs dbgregs;
				3836
				3837	r = -EFAULT;
				3838	if (copy_from_user(&dbgregs, argp,
				3839	sizeof(struct kvm_debugregs)))
				3840	break;
				3841
				3842	r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
				3843	break;
				3844	}
				3845	case KVM_GET_XSAVE: {
				3846	u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
				3847	r = -ENOMEM;
				3848	if (!u.xsave)
				3849	break;
				3850
				3851	kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
				3852
				3853	r = -EFAULT;
				3854	if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
				3855	break;
				3856	r = 0;
				3857	break;
				3858	}
				3859	case KVM_SET_XSAVE: {
				3860	u.xsave = memdup_user(argp, sizeof(*u.xsave));
				3861	if (IS_ERR(u.xsave))
				3862	return PTR_ERR(u.xsave);
				3863
				3864	r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
				3865	break;
				3866	}
				3867	case KVM_GET_XCRS: {
				3868	u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
				3869	r = -ENOMEM;
				3870	if (!u.xcrs)
				3871	break;
				3872
				3873	kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
				3874
				3875	r = -EFAULT;
				3876	if (copy_to_user(argp, u.xcrs,
				3877	sizeof(struct kvm_xcrs)))
				3878	break;
				3879	r = 0;
				3880	break;
				3881	}
				3882	case KVM_SET_XCRS: {
				3883	u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
				3884	if (IS_ERR(u.xcrs))
				3885	return PTR_ERR(u.xcrs);
				3886
				3887	r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
				3888	break;
				3889	}
				3890	case KVM_SET_TSC_KHZ: {
				3891	u32 user_tsc_khz;
				3892
				3893	r = -EINVAL;
				3894	user_tsc_khz = (u32)arg;
				3895
				3896	if (user_tsc_khz >= kvm_max_guest_tsc_khz)
				3897	goto out;
				3898
				3899	if (user_tsc_khz == 0)
				3900	user_tsc_khz = tsc_khz;
				3901
				3902	if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
				3903	r = 0;
				3904
				3905	goto out;
				3906	}
				3907	case KVM_GET_TSC_KHZ: {
				3908	r = vcpu->arch.virtual_tsc_khz;
				3909	goto out;
				3910	}
				3911	case KVM_KVMCLOCK_CTRL: {
				3912	r = kvm_set_guest_paused(vcpu);
				3913	goto out;
				3914	}
				3915	case KVM_ENABLE_CAP: {
				3916	struct kvm_enable_cap cap;
				3917
				3918	r = -EFAULT;
				3919	if (copy_from_user(&cap, argp, sizeof(cap)))
				3920	goto out;
				3921	r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
				3922	break;
				3923	}
				3924	default:
				3925	r = -EINVAL;
				3926	}
				3927	out:
				3928	kfree(u.buffer);
				3929	return r;
				3930	}
				3931
				3932	int kvm_arch_vcpu_fault(struct kvm_vcpu vcpu, struct vm_fault vmf)
				3933	{
				3934	return VM_FAULT_SIGBUS;
				3935	}
				3936
				3937	static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
				3938	{
				3939	int ret;
				3940
				3941	if (addr > (unsigned int)(-3 * PAGE_SIZE))
				3942	return -EINVAL;
				3943	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
				3944	return ret;
				3945	}
				3946
				3947	static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
				3948	u64 ident_addr)
				3949	{
				3950	kvm->arch.ept_identity_map_addr = ident_addr;
				3951	return 0;
				3952	}
				3953
				3954	static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
				3955	u32 kvm_nr_mmu_pages)
				3956	{
				3957	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
				3958	return -EINVAL;
				3959
				3960	mutex_lock(&kvm->slots_lock);
				3961
				3962	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
				3963	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
				3964
				3965	mutex_unlock(&kvm->slots_lock);
				3966	return 0;
				3967	}
				3968
				3969	static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
				3970	{
				3971	return kvm->arch.n_max_mmu_pages;
				3972	}
				3973
				3974	static int kvm_vm_ioctl_get_irqchip(struct kvm kvm, struct kvm_irqchip chip)
				3975	{
				3976	struct kvm_pic *pic = kvm->arch.vpic;
				3977	int r;
				3978
				3979	r = 0;
				3980	switch (chip->chip_id) {
				3981	case KVM_IRQCHIP_PIC_MASTER:
				3982	memcpy(&chip->chip.pic, &pic->pics[0],
				3983	sizeof(struct kvm_pic_state));
				3984	break;
				3985	case KVM_IRQCHIP_PIC_SLAVE:
				3986	memcpy(&chip->chip.pic, &pic->pics[1],
				3987	sizeof(struct kvm_pic_state));
				3988	break;
				3989	case KVM_IRQCHIP_IOAPIC:
				3990	kvm_get_ioapic(kvm, &chip->chip.ioapic);
				3991	break;
				3992	default:
				3993	r = -EINVAL;
				3994	break;
				3995	}
				3996	return r;
				3997	}
				3998
				3999	static int kvm_vm_ioctl_set_irqchip(struct kvm kvm, struct kvm_irqchip chip)
				4000	{
				4001	struct kvm_pic *pic = kvm->arch.vpic;
				4002	int r;
				4003
				4004	r = 0;
				4005	switch (chip->chip_id) {
				4006	case KVM_IRQCHIP_PIC_MASTER:
				4007	spin_lock(&pic->lock);
				4008	memcpy(&pic->pics[0], &chip->chip.pic,
				4009	sizeof(struct kvm_pic_state));
				4010	spin_unlock(&pic->lock);
				4011	break;
				4012	case KVM_IRQCHIP_PIC_SLAVE:
				4013	spin_lock(&pic->lock);
				4014	memcpy(&pic->pics[1], &chip->chip.pic,
				4015	sizeof(struct kvm_pic_state));
				4016	spin_unlock(&pic->lock);
				4017	break;
				4018	case KVM_IRQCHIP_IOAPIC:
				4019	kvm_set_ioapic(kvm, &chip->chip.ioapic);
				4020	break;
				4021	default:
				4022	r = -EINVAL;
				4023	break;
				4024	}
				4025	kvm_pic_update_irq(pic);
				4026	return r;
				4027	}
				4028
				4029	static int kvm_vm_ioctl_get_pit(struct kvm kvm, struct kvm_pit_state ps)
				4030	{
				4031	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
				4032
				4033	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
				4034
				4035	mutex_lock(&kps->lock);
				4036	memcpy(ps, &kps->channels, sizeof(*ps));
				4037	mutex_unlock(&kps->lock);
				4038	return 0;
				4039	}
				4040
				4041	static int kvm_vm_ioctl_set_pit(struct kvm kvm, struct kvm_pit_state ps)
				4042	{
				4043	int i;
				4044	struct kvm_pit *pit = kvm->arch.vpit;
				4045
				4046	mutex_lock(&pit->pit_state.lock);
				4047	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
				4048	for (i = 0; i < 3; i++)
				4049	kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
				4050	mutex_unlock(&pit->pit_state.lock);
				4051	return 0;
				4052	}
				4053
				4054	static int kvm_vm_ioctl_get_pit2(struct kvm kvm, struct kvm_pit_state2 ps)
				4055	{
				4056	mutex_lock(&kvm->arch.vpit->pit_state.lock);
				4057	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
				4058	sizeof(ps->channels));
				4059	ps->flags = kvm->arch.vpit->pit_state.flags;
				4060	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
				4061	memset(&ps->reserved, 0, sizeof(ps->reserved));
				4062	return 0;
				4063	}
				4064
				4065	static int kvm_vm_ioctl_set_pit2(struct kvm kvm, struct kvm_pit_state2 ps)
				4066	{
				4067	int start = 0;
				4068	int i;
				4069	u32 prev_legacy, cur_legacy;
				4070	struct kvm_pit *pit = kvm->arch.vpit;
				4071
				4072	mutex_lock(&pit->pit_state.lock);
				4073	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
				4074	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
				4075	if (!prev_legacy && cur_legacy)
				4076	start = 1;
				4077	memcpy(&pit->pit_state.channels, &ps->channels,
				4078	sizeof(pit->pit_state.channels));
				4079	pit->pit_state.flags = ps->flags;
				4080	for (i = 0; i < 3; i++)
				4081	kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
				4082	start && i == 0);
				4083	mutex_unlock(&pit->pit_state.lock);
				4084	return 0;
				4085	}
				4086
				4087	static int kvm_vm_ioctl_reinject(struct kvm *kvm,
				4088	struct kvm_reinject_control *control)
				4089	{
				4090	struct kvm_pit *pit = kvm->arch.vpit;
				4091
				4092	if (!pit)
				4093	return -ENXIO;
				4094
				4095	/* pit->pit_state.lock was overloaded to prevent userspace from getting
				4096	* an inconsistent state after running multiple KVM_REINJECT_CONTROL
				4097	* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
				4098	*/
				4099	mutex_lock(&pit->pit_state.lock);
				4100	kvm_pit_set_reinject(pit, control->pit_reinject);
				4101	mutex_unlock(&pit->pit_state.lock);
				4102
				4103	return 0;
				4104	}
				4105
				4106	/**
				4107	* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
				4108	* @kvm: kvm instance
				4109	* @log: slot id and address to which we copy the log
				4110	*
				4111	* Steps 1-4 below provide general overview of dirty page logging. See
				4112	* kvm_get_dirty_log_protect() function description for additional details.
				4113	*
				4114	* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
				4115	* always flush the TLB (step 4) even if previous step failed and the dirty
				4116	* bitmap may be corrupt. Regardless of previous outcome the KVM logging API
				4117	* does not preclude user space subsequent dirty log read. Flushing TLB ensures
				4118	* writes will be marked dirty for next log read.
				4119	*
				4120	* 1. Take a snapshot of the bit and clear it if needed.
				4121	* 2. Write protect the corresponding page.
				4122	* 3. Copy the snapshot to the userspace.
				4123	* 4. Flush TLB's if needed.
				4124	*/
				4125	int kvm_vm_ioctl_get_dirty_log(struct kvm kvm, struct kvm_dirty_log log)
				4126	{
				4127	bool is_dirty = false;
				4128	int r;
				4129
				4130	mutex_lock(&kvm->slots_lock);
				4131
				4132	/*
				4133	* Flush potentially hardware-cached dirty pages to dirty_bitmap.
				4134	*/
				4135	if (kvm_x86_ops->flush_log_dirty)
				4136	kvm_x86_ops->flush_log_dirty(kvm);
				4137
				4138	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
				4139
				4140	/*
				4141	* All the TLBs can be flushed out of mmu lock, see the comments in
				4142	* kvm_mmu_slot_remove_write_access().
				4143	*/
				4144	lockdep_assert_held(&kvm->slots_lock);
				4145	if (is_dirty)
				4146	kvm_flush_remote_tlbs(kvm);
				4147
				4148	mutex_unlock(&kvm->slots_lock);
				4149	return r;
				4150	}
				4151
				4152	int kvm_vm_ioctl_irq_line(struct kvm kvm, struct kvm_irq_level irq_event,
				4153	bool line_status)
				4154	{
				4155	if (!irqchip_in_kernel(kvm))
				4156	return -ENXIO;
				4157
				4158	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
				4159	irq_event->irq, irq_event->level,
				4160	line_status);
				4161	return 0;
				4162	}
				4163
				4164	static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
				4165	struct kvm_enable_cap *cap)
				4166	{
				4167	int r;
				4168
				4169	if (cap->flags)
				4170	return -EINVAL;
				4171
				4172	switch (cap->cap) {
				4173	case KVM_CAP_DISABLE_QUIRKS:
				4174	kvm->arch.disabled_quirks = cap->args[0];
				4175	r = 0;
				4176	break;
				4177	case KVM_CAP_SPLIT_IRQCHIP: {
				4178	mutex_lock(&kvm->lock);
				4179	r = -EINVAL;
				4180	if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
				4181	goto split_irqchip_unlock;
				4182	r = -EEXIST;
				4183	if (irqchip_in_kernel(kvm))
				4184	goto split_irqchip_unlock;
				4185	if (kvm->created_vcpus)
				4186	goto split_irqchip_unlock;
				4187	r = kvm_setup_empty_irq_routing(kvm);
				4188	if (r)
				4189	goto split_irqchip_unlock;
				4190	/* Pairs with irqchip_in_kernel. */
				4191	smp_wmb();
				4192	kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
				4193	kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
				4194	r = 0;
				4195	split_irqchip_unlock:
				4196	mutex_unlock(&kvm->lock);
				4197	break;
				4198	}
				4199	case KVM_CAP_X2APIC_API:
				4200	r = -EINVAL;
				4201	if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
				4202	break;
				4203
				4204	if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
				4205	kvm->arch.x2apic_format = true;
				4206	if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
				4207	kvm->arch.x2apic_broadcast_quirk_disabled = true;
				4208
				4209	r = 0;
				4210	break;
				4211	default:
				4212	r = -EINVAL;
				4213	break;
				4214	}
				4215	return r;
				4216	}
				4217
				4218	long kvm_arch_vm_ioctl(struct file *filp,
				4219	unsigned int ioctl, unsigned long arg)
				4220	{
				4221	struct kvm *kvm = filp->private_data;
				4222	void __user argp = (void __user )arg;
				4223	int r = -ENOTTY;
				4224	/*
				4225	* This union makes it completely explicit to gcc-3.x
				4226	* that these two variables' stack usage should be
				4227	* combined, not added together.
				4228	*/
				4229	union {
				4230	struct kvm_pit_state ps;
				4231	struct kvm_pit_state2 ps2;
				4232	struct kvm_pit_config pit_config;
				4233	} u;
				4234
				4235	switch (ioctl) {
				4236	case KVM_SET_TSS_ADDR:
				4237	r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
				4238	break;
				4239	case KVM_SET_IDENTITY_MAP_ADDR: {
				4240	u64 ident_addr;
				4241
				4242	r = -EFAULT;
				4243	if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
				4244	goto out;
				4245	r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
				4246	break;
				4247	}
				4248	case KVM_SET_NR_MMU_PAGES:
				4249	r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
				4250	break;
				4251	case KVM_GET_NR_MMU_PAGES:
				4252	r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
				4253	break;
				4254	case KVM_CREATE_IRQCHIP: {
				4255	mutex_lock(&kvm->lock);
				4256
				4257	r = -EEXIST;
				4258	if (irqchip_in_kernel(kvm))
				4259	goto create_irqchip_unlock;
				4260
				4261	r = -EINVAL;
				4262	if (kvm->created_vcpus)
				4263	goto create_irqchip_unlock;
				4264
				4265	r = kvm_pic_init(kvm);
				4266	if (r)
				4267	goto create_irqchip_unlock;
				4268
				4269	r = kvm_ioapic_init(kvm);
				4270	if (r) {
				4271	kvm_pic_destroy(kvm);
				4272	goto create_irqchip_unlock;
				4273	}
				4274
				4275	r = kvm_setup_default_irq_routing(kvm);
				4276	if (r) {
				4277	kvm_ioapic_destroy(kvm);
				4278	kvm_pic_destroy(kvm);
				4279	goto create_irqchip_unlock;
				4280	}
				4281	/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
				4282	smp_wmb();
				4283	kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
				4284	create_irqchip_unlock:
				4285	mutex_unlock(&kvm->lock);
				4286	break;
				4287	}
				4288	case KVM_CREATE_PIT:
				4289	u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
				4290	goto create_pit;
				4291	case KVM_CREATE_PIT2:
				4292	r = -EFAULT;
				4293	if (copy_from_user(&u.pit_config, argp,
				4294	sizeof(struct kvm_pit_config)))
				4295	goto out;
				4296	create_pit:
				4297	mutex_lock(&kvm->lock);
				4298	r = -EEXIST;
				4299	if (kvm->arch.vpit)
				4300	goto create_pit_unlock;
				4301	r = -ENOMEM;
				4302	kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
				4303	if (kvm->arch.vpit)
				4304	r = 0;
				4305	create_pit_unlock:
				4306	mutex_unlock(&kvm->lock);
				4307	break;
				4308	case KVM_GET_IRQCHIP: {
				4309	/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
				4310	struct kvm_irqchip *chip;
				4311
				4312	chip = memdup_user(argp, sizeof(*chip));
				4313	if (IS_ERR(chip)) {
				4314	r = PTR_ERR(chip);
				4315	goto out;
				4316	}
				4317
				4318	r = -ENXIO;
				4319	if (!irqchip_kernel(kvm))
				4320	goto get_irqchip_out;
				4321	r = kvm_vm_ioctl_get_irqchip(kvm, chip);
				4322	if (r)
				4323	goto get_irqchip_out;
				4324	r = -EFAULT;
				4325	if (copy_to_user(argp, chip, sizeof *chip))
				4326	goto get_irqchip_out;
				4327	r = 0;
				4328	get_irqchip_out:
				4329	kfree(chip);
				4330	break;
				4331	}
				4332	case KVM_SET_IRQCHIP: {
				4333	/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
				4334	struct kvm_irqchip *chip;
				4335
				4336	chip = memdup_user(argp, sizeof(*chip));
				4337	if (IS_ERR(chip)) {
				4338	r = PTR_ERR(chip);
				4339	goto out;
				4340	}
				4341
				4342	r = -ENXIO;
				4343	if (!irqchip_kernel(kvm))
				4344	goto set_irqchip_out;
				4345	r = kvm_vm_ioctl_set_irqchip(kvm, chip);
				4346	if (r)
				4347	goto set_irqchip_out;
				4348	r = 0;
				4349	set_irqchip_out:
				4350	kfree(chip);
				4351	break;
				4352	}
				4353	case KVM_GET_PIT: {
				4354	r = -EFAULT;
				4355	if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
				4356	goto out;
				4357	r = -ENXIO;
				4358	if (!kvm->arch.vpit)
				4359	goto out;
				4360	r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
				4361	if (r)
				4362	goto out;
				4363	r = -EFAULT;
				4364	if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
				4365	goto out;
				4366	r = 0;
				4367	break;
				4368	}
				4369	case KVM_SET_PIT: {
				4370	r = -EFAULT;
				4371	if (copy_from_user(&u.ps, argp, sizeof u.ps))
				4372	goto out;
				4373	mutex_lock(&kvm->lock);
				4374	r = -ENXIO;
				4375	if (!kvm->arch.vpit)
				4376	goto set_pit_out;
				4377	r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
				4378	set_pit_out:
				4379	mutex_unlock(&kvm->lock);
				4380	break;
				4381	}
				4382	case KVM_GET_PIT2: {
				4383	r = -ENXIO;
				4384	if (!kvm->arch.vpit)
				4385	goto out;
				4386	r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
				4387	if (r)
				4388	goto out;
				4389	r = -EFAULT;
				4390	if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
				4391	goto out;
				4392	r = 0;
				4393	break;
				4394	}
				4395	case KVM_SET_PIT2: {
				4396	r = -EFAULT;
				4397	if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
				4398	goto out;
				4399	mutex_lock(&kvm->lock);
				4400	r = -ENXIO;
				4401	if (!kvm->arch.vpit)
				4402	goto set_pit2_out;
				4403	r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
				4404	set_pit2_out:
				4405	mutex_unlock(&kvm->lock);
				4406	break;
				4407	}
				4408	case KVM_REINJECT_CONTROL: {
				4409	struct kvm_reinject_control control;
				4410	r = -EFAULT;
				4411	if (copy_from_user(&control, argp, sizeof(control)))
				4412	goto out;
				4413	r = kvm_vm_ioctl_reinject(kvm, &control);
				4414	break;
				4415	}
				4416	case KVM_SET_BOOT_CPU_ID:
				4417	r = 0;
				4418	mutex_lock(&kvm->lock);
				4419	if (kvm->created_vcpus)
				4420	r = -EBUSY;
				4421	else
				4422	kvm->arch.bsp_vcpu_id = arg;
				4423	mutex_unlock(&kvm->lock);
				4424	break;
				4425	case KVM_XEN_HVM_CONFIG: {
				4426	struct kvm_xen_hvm_config xhc;
				4427	r = -EFAULT;
				4428	if (copy_from_user(&xhc, argp, sizeof(xhc)))
				4429	goto out;
				4430	r = -EINVAL;
				4431	if (xhc.flags)
				4432	goto out;
				4433	memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
				4434	r = 0;
				4435	break;
				4436	}
				4437	case KVM_SET_CLOCK: {
				4438	struct kvm_clock_data user_ns;
				4439	u64 now_ns;
				4440
				4441	r = -EFAULT;
				4442	if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
				4443	goto out;
				4444
				4445	r = -EINVAL;
				4446	if (user_ns.flags)
				4447	goto out;
				4448
				4449	r = 0;
				4450	/*
				4451	* TODO: userspace has to take care of races with VCPU_RUN, so
				4452	* kvm_gen_update_masterclock() can be cut down to locked
				4453	* pvclock_update_vm_gtod_copy().
				4454	*/
				4455	kvm_gen_update_masterclock(kvm);
				4456	now_ns = get_kvmclock_ns(kvm);
				4457	kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
				4458	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
				4459	break;
				4460	}
				4461	case KVM_GET_CLOCK: {
				4462	struct kvm_clock_data user_ns;
				4463	u64 now_ns;
				4464
				4465	now_ns = get_kvmclock_ns(kvm);
				4466	user_ns.clock = now_ns;
				4467	user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
				4468	memset(&user_ns.pad, 0, sizeof(user_ns.pad));
				4469
				4470	r = -EFAULT;
				4471	if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
				4472	goto out;
				4473	r = 0;
				4474	break;
				4475	}
				4476	case KVM_ENABLE_CAP: {
				4477	struct kvm_enable_cap cap;
				4478
				4479	r = -EFAULT;
				4480	if (copy_from_user(&cap, argp, sizeof(cap)))
				4481	goto out;
				4482	r = kvm_vm_ioctl_enable_cap(kvm, &cap);
				4483	break;
				4484	}
				4485	default:
				4486	r = -ENOTTY;
				4487	}
				4488	out:
				4489	return r;
				4490	}
				4491
				4492	static void kvm_init_msr_list(void)
				4493	{
				4494	u32 dummy[2];
				4495	unsigned i, j;
				4496
				4497	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
				4498	if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
				4499	continue;
				4500
				4501	/*
				4502	* Even MSRs that are valid in the host may not be exposed
				4503	* to the guests in some cases.
				4504	*/
				4505	switch (msrs_to_save[i]) {
				4506	case MSR_IA32_BNDCFGS:
				4507	if (!kvm_x86_ops->mpx_supported())
				4508	continue;
				4509	break;
				4510	case MSR_TSC_AUX:
				4511	if (!kvm_x86_ops->rdtscp_supported())
				4512	continue;
				4513	break;
				4514	default:
				4515	break;
				4516	}
				4517
				4518	if (j < i)
				4519	msrs_to_save[j] = msrs_to_save[i];
				4520	j++;
				4521	}
				4522	num_msrs_to_save = j;
				4523
				4524	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
				4525	if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
				4526	continue;
				4527
				4528	if (j < i)
				4529	emulated_msrs[j] = emulated_msrs[i];
				4530	j++;
				4531	}
				4532	num_emulated_msrs = j;
				4533
				4534	for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
				4535	struct kvm_msr_entry msr;
				4536
				4537	msr.index = msr_based_features[i];
				4538	if (kvm_get_msr_feature(&msr))
				4539	continue;
				4540
				4541	if (j < i)
				4542	msr_based_features[j] = msr_based_features[i];
				4543	j++;
				4544	}
				4545	num_msr_based_features = j;
				4546	}
				4547
				4548	static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
				4549	const void *v)
				4550	{
				4551	int handled = 0;
				4552	int n;
				4553
				4554	do {
				4555	n = min(len, 8);
				4556	if (!(lapic_in_kernel(vcpu) &&
				4557	!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
				4558	&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
				4559	break;
				4560	handled += n;
				4561	addr += n;
				4562	len -= n;
				4563	v += n;
				4564	} while (len);
				4565
				4566	return handled;
				4567	}
				4568
				4569	static int vcpu_mmio_read(struct kvm_vcpu vcpu, gpa_t addr, int len, void v)
				4570	{
				4571	int handled = 0;
				4572	int n;
				4573
				4574	do {
				4575	n = min(len, 8);
				4576	if (!(lapic_in_kernel(vcpu) &&
				4577	!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
				4578	addr, n, v))
				4579	&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
				4580	break;
				4581	trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
				4582	handled += n;
				4583	addr += n;
				4584	len -= n;
				4585	v += n;
				4586	} while (len);
				4587
				4588	return handled;
				4589	}
				4590
				4591	static void kvm_set_segment(struct kvm_vcpu *vcpu,
				4592	struct kvm_segment *var, int seg)
				4593	{
				4594	kvm_x86_ops->set_segment(vcpu, var, seg);
				4595	}
				4596
				4597	void kvm_get_segment(struct kvm_vcpu *vcpu,
				4598	struct kvm_segment *var, int seg)
				4599	{
				4600	kvm_x86_ops->get_segment(vcpu, var, seg);
				4601	}
				4602
				4603	gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
				4604	struct x86_exception *exception)
				4605	{
				4606	gpa_t t_gpa;
				4607
				4608	BUG_ON(!mmu_is_nested(vcpu));
				4609
				4610	/* NPT walks are always user-walks */
				4611	access \|= PFERR_USER_MASK;
				4612	t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
				4613
				4614	return t_gpa;
				4615	}
				4616
				4617	gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
				4618	struct x86_exception *exception)
				4619	{
				4620	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				4621	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				4622	}
				4623
				4624	gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
				4625	struct x86_exception *exception)
				4626	{
				4627	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				4628	access \|= PFERR_FETCH_MASK;
				4629	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				4630	}
				4631
				4632	gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
				4633	struct x86_exception *exception)
				4634	{
				4635	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				4636	access \|= PFERR_WRITE_MASK;
				4637	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				4638	}
				4639
				4640	/* uses this to access any guest's mapped memory without checking CPL */
				4641	gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
				4642	struct x86_exception *exception)
				4643	{
				4644	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
				4645	}
				4646
				4647	static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
				4648	struct kvm_vcpu *vcpu, u32 access,
				4649	struct x86_exception *exception)
				4650	{
				4651	void *data = val;
				4652	int r = X86EMUL_CONTINUE;
				4653
				4654	while (bytes) {
				4655	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
				4656	exception);
				4657	unsigned offset = addr & (PAGE_SIZE-1);
				4658	unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
				4659	int ret;
				4660
				4661	if (gpa == UNMAPPED_GVA)
				4662	return X86EMUL_PROPAGATE_FAULT;
				4663	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
				4664	offset, toread);
				4665	if (ret < 0) {
				4666	r = X86EMUL_IO_NEEDED;
				4667	goto out;
				4668	}
				4669
				4670	bytes -= toread;
				4671	data += toread;
				4672	addr += toread;
				4673	}
				4674	out:
				4675	return r;
				4676	}
				4677
				4678	/* used for instruction fetching */
				4679	static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
				4680	gva_t addr, void *val, unsigned int bytes,
				4681	struct x86_exception *exception)
				4682	{
				4683	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				4684	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				4685	unsigned offset;
				4686	int ret;
				4687
				4688	/* Inline kvm_read_guest_virt_helper for speed. */
				4689	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access\|PFERR_FETCH_MASK,
				4690	exception);
				4691	if (unlikely(gpa == UNMAPPED_GVA))
				4692	return X86EMUL_PROPAGATE_FAULT;
				4693
				4694	offset = addr & (PAGE_SIZE-1);
				4695	if (WARN_ON(offset + bytes > PAGE_SIZE))
				4696	bytes = (unsigned)PAGE_SIZE - offset;
				4697	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
				4698	offset, bytes);
				4699	if (unlikely(ret < 0))
				4700	return X86EMUL_IO_NEEDED;
				4701
				4702	return X86EMUL_CONTINUE;
				4703	}
				4704
				4705	int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
				4706	gva_t addr, void *val, unsigned int bytes,
				4707	struct x86_exception *exception)
				4708	{
				4709	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				4710
				4711	/*
				4712	* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
				4713	* is returned, but our callers are not ready for that and they blindly
				4714	* call kvm_inject_page_fault. Ensure that they at least do not leak
				4715	* uninitialized kernel stack memory into cr2 and error code.
				4716	*/
				4717	memset(exception, 0, sizeof(*exception));
				4718	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
				4719	exception);
				4720	}
				4721	EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
				4722
				4723	static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
				4724	gva_t addr, void *val, unsigned int bytes,
				4725	struct x86_exception *exception, bool system)
				4726	{
				4727	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				4728	u32 access = 0;
				4729
				4730	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
				4731	access \|= PFERR_USER_MASK;
				4732
				4733	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
				4734	}
				4735
				4736	static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
				4737	unsigned long addr, void *val, unsigned int bytes)
				4738	{
				4739	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				4740	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
				4741
				4742	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
				4743	}
				4744
				4745	static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
				4746	struct kvm_vcpu *vcpu, u32 access,
				4747	struct x86_exception *exception)
				4748	{
				4749	void *data = val;
				4750	int r = X86EMUL_CONTINUE;
				4751
				4752	while (bytes) {
				4753	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
				4754	access,
				4755	exception);
				4756	unsigned offset = addr & (PAGE_SIZE-1);
				4757	unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
				4758	int ret;
				4759
				4760	if (gpa == UNMAPPED_GVA)
				4761	return X86EMUL_PROPAGATE_FAULT;
				4762	ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
				4763	if (ret < 0) {
				4764	r = X86EMUL_IO_NEEDED;
				4765	goto out;
				4766	}
				4767
				4768	bytes -= towrite;
				4769	data += towrite;
				4770	addr += towrite;
				4771	}
				4772	out:
				4773	return r;
				4774	}
				4775
				4776	static int emulator_write_std(struct x86_emulate_ctxt ctxt, gva_t addr, void val,
				4777	unsigned int bytes, struct x86_exception *exception,
				4778	bool system)
				4779	{
				4780	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				4781	u32 access = PFERR_WRITE_MASK;
				4782
				4783	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
				4784	access \|= PFERR_USER_MASK;
				4785
				4786	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
				4787	access, exception);
				4788	}
				4789
				4790	int kvm_write_guest_virt_system(struct kvm_vcpu vcpu, gva_t addr, void val,
				4791	unsigned int bytes, struct x86_exception *exception)
				4792	{
				4793	/* kvm_write_guest_virt_system can pull in tons of pages. */
				4794	vcpu->arch.l1tf_flush_l1d = true;
				4795
				4796	/*
				4797	* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
				4798	* is returned, but our callers are not ready for that and they blindly
				4799	* call kvm_inject_page_fault. Ensure that they at least do not leak
				4800	* uninitialized kernel stack memory into cr2 and error code.
				4801	*/
				4802	memset(exception, 0, sizeof(*exception));
				4803	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
				4804	PFERR_WRITE_MASK, exception);
				4805	}
				4806	EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
				4807
				4808	static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
				4809	gpa_t gpa, bool write)
				4810	{
				4811	/* For APIC access vmexit */
				4812	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
				4813	return 1;
				4814
				4815	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
				4816	trace_vcpu_match_mmio(gva, gpa, write, true);
				4817	return 1;
				4818	}
				4819
				4820	return 0;
				4821	}
				4822
				4823	static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
				4824	gpa_t gpa, struct x86_exception exception,
				4825	bool write)
				4826	{
				4827	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
				4828	\| (write ? PFERR_WRITE_MASK : 0);
				4829
				4830	/*
				4831	* currently PKRU is only applied to ept enabled guest so
				4832	* there is no pkey in EPT page table for L1 guest or EPT
				4833	* shadow page table for L2 guest.
				4834	*/
				4835	if (vcpu_match_mmio_gva(vcpu, gva)
				4836	&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
				4837	vcpu->arch.access, 0, access)) {
				4838	*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT \|
				4839	(gva & (PAGE_SIZE - 1));
				4840	trace_vcpu_match_mmio(gva, *gpa, write, false);
				4841	return 1;
				4842	}
				4843
				4844	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				4845
				4846	if (*gpa == UNMAPPED_GVA)
				4847	return -1;
				4848
				4849	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
				4850	}
				4851
				4852	int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
				4853	const void *val, int bytes)
				4854	{
				4855	int ret;
				4856
				4857	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
				4858	if (ret < 0)
				4859	return 0;
				4860	kvm_page_track_write(vcpu, gpa, val, bytes);
				4861	return 1;
				4862	}
				4863
				4864	struct read_write_emulator_ops {
				4865	int (read_write_prepare)(struct kvm_vcpu vcpu, void *val,
				4866	int bytes);
				4867	int (read_write_emulate)(struct kvm_vcpu vcpu, gpa_t gpa,
				4868	void *val, int bytes);
				4869	int (read_write_mmio)(struct kvm_vcpu vcpu, gpa_t gpa,
				4870	int bytes, void *val);
				4871	int (read_write_exit_mmio)(struct kvm_vcpu vcpu, gpa_t gpa,
				4872	void *val, int bytes);
				4873	bool write;
				4874	};
				4875
				4876	static int read_prepare(struct kvm_vcpu vcpu, void val, int bytes)
				4877	{
				4878	if (vcpu->mmio_read_completed) {
				4879	trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
				4880	vcpu->mmio_fragments[0].gpa, val);
				4881	vcpu->mmio_read_completed = 0;
				4882	return 1;
				4883	}
				4884
				4885	return 0;
				4886	}
				4887
				4888	static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
				4889	void *val, int bytes)
				4890	{
				4891	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
				4892	}
				4893
				4894	static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
				4895	void *val, int bytes)
				4896	{
				4897	return emulator_write_phys(vcpu, gpa, val, bytes);
				4898	}
				4899
				4900	static int write_mmio(struct kvm_vcpu vcpu, gpa_t gpa, int bytes, void val)
				4901	{
				4902	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
				4903	return vcpu_mmio_write(vcpu, gpa, bytes, val);
				4904	}
				4905
				4906	static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
				4907	void *val, int bytes)
				4908	{
				4909	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
				4910	return X86EMUL_IO_NEEDED;
				4911	}
				4912
				4913	static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
				4914	void *val, int bytes)
				4915	{
				4916	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
				4917
				4918	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
				4919	return X86EMUL_CONTINUE;
				4920	}
				4921
				4922	static const struct read_write_emulator_ops read_emultor = {
				4923	.read_write_prepare = read_prepare,
				4924	.read_write_emulate = read_emulate,
				4925	.read_write_mmio = vcpu_mmio_read,
				4926	.read_write_exit_mmio = read_exit_mmio,
				4927	};
				4928
				4929	static const struct read_write_emulator_ops write_emultor = {
				4930	.read_write_emulate = write_emulate,
				4931	.read_write_mmio = write_mmio,
				4932	.read_write_exit_mmio = write_exit_mmio,
				4933	.write = true,
				4934	};
				4935
				4936	static int emulator_read_write_onepage(unsigned long addr, void *val,
				4937	unsigned int bytes,
				4938	struct x86_exception *exception,
				4939	struct kvm_vcpu *vcpu,
				4940	const struct read_write_emulator_ops *ops)
				4941	{
				4942	gpa_t gpa;
				4943	int handled, ret;
				4944	bool write = ops->write;
				4945	struct kvm_mmio_fragment *frag;
				4946	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				4947
				4948	/*
				4949	* If the exit was due to a NPF we may already have a GPA.
				4950	* If the GPA is present, use it to avoid the GVA to GPA table walk.
				4951	* Note, this cannot be used on string operations since string
				4952	* operation using rep will only have the initial GPA from the NPF
				4953	* occurred.
				4954	*/
				4955	if (vcpu->arch.gpa_available &&
				4956	emulator_can_use_gpa(ctxt) &&
				4957	(addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
				4958	gpa = vcpu->arch.gpa_val;
				4959	ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
				4960	} else {
				4961	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
				4962	if (ret < 0)
				4963	return X86EMUL_PROPAGATE_FAULT;
				4964	}
				4965
				4966	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
				4967	return X86EMUL_CONTINUE;
				4968
				4969	/*
				4970	* Is this MMIO handled locally?
				4971	*/
				4972	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
				4973	if (handled == bytes)
				4974	return X86EMUL_CONTINUE;
				4975
				4976	gpa += handled;
				4977	bytes -= handled;
				4978	val += handled;
				4979
				4980	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
				4981	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
				4982	frag->gpa = gpa;
				4983	frag->data = val;
				4984	frag->len = bytes;
				4985	return X86EMUL_CONTINUE;
				4986	}
				4987
				4988	static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
				4989	unsigned long addr,
				4990	void *val, unsigned int bytes,
				4991	struct x86_exception *exception,
				4992	const struct read_write_emulator_ops *ops)
				4993	{
				4994	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				4995	gpa_t gpa;
				4996	int rc;
				4997
				4998	if (ops->read_write_prepare &&
				4999	ops->read_write_prepare(vcpu, val, bytes))
				5000	return X86EMUL_CONTINUE;
				5001
				5002	vcpu->mmio_nr_fragments = 0;
				5003
				5004	/* Crossing a page boundary? */
				5005	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
				5006	int now;
				5007
				5008	now = -addr & ~PAGE_MASK;
				5009	rc = emulator_read_write_onepage(addr, val, now, exception,
				5010	vcpu, ops);
				5011
				5012	if (rc != X86EMUL_CONTINUE)
				5013	return rc;
				5014	addr += now;
				5015	if (ctxt->mode != X86EMUL_MODE_PROT64)
				5016	addr = (u32)addr;
				5017	val += now;
				5018	bytes -= now;
				5019	}
				5020
				5021	rc = emulator_read_write_onepage(addr, val, bytes, exception,
				5022	vcpu, ops);
				5023	if (rc != X86EMUL_CONTINUE)
				5024	return rc;
				5025
				5026	if (!vcpu->mmio_nr_fragments)
				5027	return rc;
				5028
				5029	gpa = vcpu->mmio_fragments[0].gpa;
				5030
				5031	vcpu->mmio_needed = 1;
				5032	vcpu->mmio_cur_fragment = 0;
				5033
				5034	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
				5035	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
				5036	vcpu->run->exit_reason = KVM_EXIT_MMIO;
				5037	vcpu->run->mmio.phys_addr = gpa;
				5038
				5039	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
				5040	}
				5041
				5042	static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
				5043	unsigned long addr,
				5044	void *val,
				5045	unsigned int bytes,
				5046	struct x86_exception *exception)
				5047	{
				5048	return emulator_read_write(ctxt, addr, val, bytes,
				5049	exception, &read_emultor);
				5050	}
				5051
				5052	static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
				5053	unsigned long addr,
				5054	const void *val,
				5055	unsigned int bytes,
				5056	struct x86_exception *exception)
				5057	{
				5058	return emulator_read_write(ctxt, addr, (void *)val, bytes,
				5059	exception, &write_emultor);
				5060	}
				5061
				/*
				 * CMPXCHG_TYPE(t, ptr, old, new): compare-and-exchange an object of type
				 * @t at @ptr. @old and @new are pointers to the expected and replacement
				 * values. Evaluates to true when the exchange took place, i.e. when the
				 * value returned by cmpxchg() equals *@old.
				 */
				#define CMPXCHG_TYPE(t, ptr, old, new) \
					(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))

				/* 64-bit flavour: uses cmpxchg64() when CONFIG_X86_64 is not set. */
				#ifdef CONFIG_X86_64
				#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
				#else
				#  define CMPXCHG64(ptr, old, new) \
					(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
				#endif
				5071
				5072	static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
				5073	unsigned long addr,
				5074	const void *old,
				5075	const void *new,
				5076	unsigned int bytes,
				5077	struct x86_exception *exception)
				5078	{
				5079	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5080	gpa_t gpa;
				5081	struct page *page;
				5082	char *kaddr;
				5083	bool exchanged;
				5084
				5085	/* guests cmpxchg8b have to be emulated atomically */
				5086	if (bytes > 8 \|\| (bytes & (bytes - 1)))
				5087	goto emul_write;
				5088
				5089	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
				5090
				5091	if (gpa == UNMAPPED_GVA \|\|
				5092	(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
				5093	goto emul_write;
				5094
				5095	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
				5096	goto emul_write;
				5097
				5098	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
				5099	if (is_error_page(page))
				5100	goto emul_write;
				5101
				5102	kaddr = kmap_atomic(page);
				5103	kaddr += offset_in_page(gpa);
				5104	switch (bytes) {
				5105	case 1:
				5106	exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
				5107	break;
				5108	case 2:
				5109	exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
				5110	break;
				5111	case 4:
				5112	exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
				5113	break;
				5114	case 8:
				5115	exchanged = CMPXCHG64(kaddr, old, new);
				5116	break;
				5117	default:
				5118	BUG();
				5119	}
				5120	kunmap_atomic(kaddr);
				5121	kvm_release_page_dirty(page);
				5122
				5123	if (!exchanged)
				5124	return X86EMUL_CMPXCHG_FAILED;
				5125
				5126	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
				5127	kvm_page_track_write(vcpu, gpa, new, bytes);
				5128
				5129	return X86EMUL_CONTINUE;
				5130
				5131	emul_write:
				5132	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
				5133
				5134	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
				5135	}
				5136
				5137	static int kernel_pio(struct kvm_vcpu vcpu, void pd)
				5138	{
				5139	int r = 0, i;
				5140
				5141	for (i = 0; i < vcpu->arch.pio.count; i++) {
				5142	if (vcpu->arch.pio.in)
				5143	r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
				5144	vcpu->arch.pio.size, pd);
				5145	else
				5146	r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
				5147	vcpu->arch.pio.port, vcpu->arch.pio.size,
				5148	pd);
				5149	if (r)
				5150	break;
				5151	pd += vcpu->arch.pio.size;
				5152	}
				5153	return r;
				5154	}
				5155
				5156	static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
				5157	unsigned short port, void *val,
				5158	unsigned int count, bool in)
				5159	{
				5160	vcpu->arch.pio.port = port;
				5161	vcpu->arch.pio.in = in;
				5162	vcpu->arch.pio.count = count;
				5163	vcpu->arch.pio.size = size;
				5164
				5165	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
				5166	vcpu->arch.pio.count = 0;
				5167	return 1;
				5168	}
				5169
				5170	vcpu->run->exit_reason = KVM_EXIT_IO;
				5171	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
				5172	vcpu->run->io.size = size;
				5173	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
				5174	vcpu->run->io.count = count;
				5175	vcpu->run->io.port = port;
				5176
				5177	return 0;
				5178	}
				5179
				5180	static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
				5181	int size, unsigned short port, void *val,
				5182	unsigned int count)
				5183	{
				5184	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5185	int ret;
				5186
				5187	if (vcpu->arch.pio.count)
				5188	goto data_avail;
				5189
				5190	memset(vcpu->arch.pio_data, 0, size * count);
				5191
				5192	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
				5193	if (ret) {
				5194	data_avail:
				5195	memcpy(val, vcpu->arch.pio_data, size * count);
				5196	trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
				5197	vcpu->arch.pio.count = 0;
				5198	return 1;
				5199	}
				5200
				5201	return 0;
				5202	}
				5203
				5204	static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
				5205	int size, unsigned short port,
				5206	const void *val, unsigned int count)
				5207	{
				5208	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5209
				5210	memcpy(vcpu->arch.pio_data, val, size * count);
				5211	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
				5212	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
				5213	}
				5214
				5215	static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
				5216	{
				5217	return kvm_x86_ops->get_segment_base(vcpu, seg);
				5218	}
				5219
				5220	static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
				5221	{
				5222	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
				5223	}
				5224
				5225	static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
				5226	{
				5227	if (!need_emulate_wbinvd(vcpu))
				5228	return X86EMUL_CONTINUE;
				5229
				5230	if (kvm_x86_ops->has_wbinvd_exit()) {
				5231	int cpu = get_cpu();
				5232
				5233	cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
				5234	smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
				5235	wbinvd_ipi, NULL, 1);
				5236	put_cpu();
				5237	cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
				5238	} else
				5239	wbinvd();
				5240	return X86EMUL_CONTINUE;
				5241	}
				5242
				/*
				 * Emulate WBINVD and then skip the instruction. Returns the result of
				 * kvm_skip_emulated_instruction().
				 */
				int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
				{
					int ret;

					kvm_emulate_wbinvd_noskip(vcpu);
					ret = kvm_skip_emulated_instruction(vcpu);

					return ret;
				}
				EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
				5249
				5250
				5251
				/* Emulator ->wbinvd callback; the helper's return value is ignored. */
				static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
				{
					struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

					kvm_emulate_wbinvd_noskip(vcpu);
				}
				5256
				/*
				 * Emulator ->get_dr callback: read debug register @dr into *@dest via
				 * kvm_get_dr() and return its result.
				 */
				static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
							   unsigned long *dest)
				{
					struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

					return kvm_get_dr(vcpu, dr, dest);
				}
				5262
				/*
				 * Emulator ->set_dr callback: write @value to debug register @dr via
				 * __kvm_set_dr() and return its result.
				 */
				static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
							   unsigned long value)
				{
					struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

					return __kvm_set_dr(vcpu, dr, value);
				}
				5269
				5270	static u64 mk_cr_64(u64 curr_cr, u32 new_val)
				5271	{
				5272	return (curr_cr & ~((1ULL << 32) - 1)) \| new_val;
				5273	}
				5274
				5275	static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
				5276	{
				5277	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5278	unsigned long value;
				5279
				5280	switch (cr) {
				5281	case 0:
				5282	value = kvm_read_cr0(vcpu);
				5283	break;
				5284	case 2:
				5285	value = vcpu->arch.cr2;
				5286	break;
				5287	case 3:
				5288	value = kvm_read_cr3(vcpu);
				5289	break;
				5290	case 4:
				5291	value = kvm_read_cr4(vcpu);
				5292	break;
				5293	case 8:
				5294	value = kvm_get_cr8(vcpu);
				5295	break;
				5296	default:
				5297	kvm_err("%s: unexpected cr %u\n", __func__, cr);
				5298	return 0;
				5299	}
				5300
				5301	return value;
				5302	}
				5303
				5304	static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
				5305	{
				5306	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5307	int res = 0;
				5308
				5309	switch (cr) {
				5310	case 0:
				5311	res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
				5312	break;
				5313	case 2:
				5314	vcpu->arch.cr2 = val;
				5315	break;
				5316	case 3:
				5317	res = kvm_set_cr3(vcpu, val);
				5318	break;
				5319	case 4:
				5320	res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
				5321	break;
				5322	case 8:
				5323	res = kvm_set_cr8(vcpu, val);
				5324	break;
				5325	default:
				5326	kvm_err("%s: unexpected cr %u\n", __func__, cr);
				5327	res = -1;
				5328	}
				5329
				5330	return res;
				5331	}
				5332
				5333	static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
				5334	{
				5335	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
				5336	}
				5337
				5338	static void emulator_get_gdt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				5339	{
				5340	kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
				5341	}
				5342
				5343	static void emulator_get_idt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				5344	{
				5345	kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
				5346	}
				5347
				5348	static void emulator_set_gdt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				5349	{
				5350	kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
				5351	}
				5352
				5353	static void emulator_set_idt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				5354	{
				5355	kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
				5356	}
				5357
				/*
				 * Emulator ->get_cached_segment_base callback: base of segment @seg as
				 * returned by get_segment_base().
				 */
				static unsigned long emulator_get_cached_segment_base(
					struct x86_emulate_ctxt *ctxt, int seg)
				{
					struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

					return get_segment_base(vcpu, seg);
				}
				5363
				5364	static bool emulator_get_segment(struct x86_emulate_ctxt ctxt, u16 selector,
				5365	struct desc_struct desc, u32 base3,
				5366	int seg)
				5367	{
				5368	struct kvm_segment var;
				5369
				5370	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
				5371	*selector = var.selector;
				5372
				5373	if (var.unusable) {
				5374	memset(desc, 0, sizeof(*desc));
				5375	if (base3)
				5376	*base3 = 0;
				5377	return false;
				5378	}
				5379
				5380	if (var.g)
				5381	var.limit >>= 12;
				5382	set_desc_limit(desc, var.limit);
				5383	set_desc_base(desc, (unsigned long)var.base);
				5384	#ifdef CONFIG_X86_64
				5385	if (base3)
				5386	*base3 = var.base >> 32;
				5387	#endif
				5388	desc->type = var.type;
				5389	desc->s = var.s;
				5390	desc->dpl = var.dpl;
				5391	desc->p = var.present;
				5392	desc->avl = var.avl;
				5393	desc->l = var.l;
				5394	desc->d = var.db;
				5395	desc->g = var.g;
				5396
				5397	return true;
				5398	}
				5399
				5400	static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
				5401	struct desc_struct *desc, u32 base3,
				5402	int seg)
				5403	{
				5404	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5405	struct kvm_segment var;
				5406
				5407	var.selector = selector;
				5408	var.base = get_desc_base(desc);
				5409	#ifdef CONFIG_X86_64
				5410	var.base \|= ((u64)base3) << 32;
				5411	#endif
				5412	var.limit = get_desc_limit(desc);
				5413	if (desc->g)
				5414	var.limit = (var.limit << 12) \| 0xfff;
				5415	var.type = desc->type;
				5416	var.dpl = desc->dpl;
				5417	var.db = desc->d;
				5418	var.s = desc->s;
				5419	var.l = desc->l;
				5420	var.g = desc->g;
				5421	var.avl = desc->avl;
				5422	var.present = desc->p;
				5423	var.unusable = !var.present;
				5424	var.padding = 0;
				5425
				5426	kvm_set_segment(vcpu, &var, seg);
				5427	return;
				5428	}
				5429
				5430	static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
				5431	u32 msr_index, u64 *pdata)
				5432	{
				5433	struct msr_data msr;
				5434	int r;
				5435
				5436	msr.index = msr_index;
				5437	msr.host_initiated = false;
				5438	r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
				5439	if (r)
				5440	return r;
				5441
				5442	*pdata = msr.data;
				5443	return 0;
				5444	}
				5445
				5446	static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
				5447	u32 msr_index, u64 data)
				5448	{
				5449	struct msr_data msr;
				5450
				5451	msr.data = data;
				5452	msr.index = msr_index;
				5453	msr.host_initiated = false;
				5454	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
				5455	}
				5456
				5457	static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
				5458	{
				5459	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5460
				5461	return vcpu->arch.smbase;
				5462	}
				5463
				5464	static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
				5465	{
				5466	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5467
				5468	vcpu->arch.smbase = smbase;
				5469	}
				5470
				5471	static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
				5472	u32 pmc)
				5473	{
				5474	return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
				5475	}
				5476
				5477	static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
				5478	u32 pmc, u64 *pdata)
				5479	{
				5480	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
				5481	}
				5482
				5483	static void emulator_halt(struct x86_emulate_ctxt *ctxt)
				5484	{
				5485	emul_to_vcpu(ctxt)->arch.halt_request = 1;
				5486	}
				5487
				/* Emulator ->get_fpu callback: intentionally a no-op. */
				static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
				{
					/* nothing to do */
				}
				5491
				/* Emulator ->put_fpu callback: intentionally a no-op. */
				static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
				{
					/* nothing to do */
				}
				5495
				5496	static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
				5497	struct x86_instruction_info *info,
				5498	enum x86_intercept_stage stage)
				5499	{
				5500	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
				5501	}
				5502
				5503	static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
				5504	u32 eax, u32 ebx, u32 ecx, u32 edx, bool check_limit)
				5505	{
				5506	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
				5507	}
				5508
				5509	static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
				5510	{
				5511	return kvm_register_read(emul_to_vcpu(ctxt), reg);
				5512	}
				5513
				5514	static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
				5515	{
				5516	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
				5517	}
				5518
				5519	static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
				5520	{
				5521	kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
				5522	}
				5523
				5524	static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
				5525	{
				5526	return emul_to_vcpu(ctxt)->arch.hflags;
				5527	}
				5528
				/* Emulator ->set_hflags callback: apply @emul_flags via kvm_set_hflags(). */
				static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
				{
					struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

					kvm_set_hflags(vcpu, emul_flags);
				}
				5533
				5534	static const struct x86_emulate_ops emulate_ops = {
				5535	.read_gpr = emulator_read_gpr,
				5536	.write_gpr = emulator_write_gpr,
				5537	.read_std = emulator_read_std,
				5538	.write_std = emulator_write_std,
				5539	.read_phys = kvm_read_guest_phys_system,
				5540	.fetch = kvm_fetch_guest_virt,
				5541	.read_emulated = emulator_read_emulated,
				5542	.write_emulated = emulator_write_emulated,
				5543	.cmpxchg_emulated = emulator_cmpxchg_emulated,
				5544	.invlpg = emulator_invlpg,
				5545	.pio_in_emulated = emulator_pio_in_emulated,
				5546	.pio_out_emulated = emulator_pio_out_emulated,
				5547	.get_segment = emulator_get_segment,
				5548	.set_segment = emulator_set_segment,
				5549	.get_cached_segment_base = emulator_get_cached_segment_base,
				5550	.get_gdt = emulator_get_gdt,
				5551	.get_idt = emulator_get_idt,
				5552	.set_gdt = emulator_set_gdt,
				5553	.set_idt = emulator_set_idt,
				5554	.get_cr = emulator_get_cr,
				5555	.set_cr = emulator_set_cr,
				5556	.cpl = emulator_get_cpl,
				5557	.get_dr = emulator_get_dr,
				5558	.set_dr = emulator_set_dr,
				5559	.get_smbase = emulator_get_smbase,
				5560	.set_smbase = emulator_set_smbase,
				5561	.set_msr = emulator_set_msr,
				5562	.get_msr = emulator_get_msr,
				5563	.check_pmc = emulator_check_pmc,
				5564	.read_pmc = emulator_read_pmc,
				5565	.halt = emulator_halt,
				5566	.wbinvd = emulator_wbinvd,
				5567	.fix_hypercall = emulator_fix_hypercall,
				5568	.get_fpu = emulator_get_fpu,
				5569	.put_fpu = emulator_put_fpu,
				5570	.intercept = emulator_intercept,
				5571	.get_cpuid = emulator_get_cpuid,
				5572	.set_nmi_mask = emulator_set_nmi_mask,
				5573	.get_hflags = emulator_get_hflags,
				5574	.set_hflags = emulator_set_hflags,
				5575	};
				5576
				5577	static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
				5578	{
				5579	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
				5580	/*
				5581	* an sti; sti; sequence only disable interrupts for the first
				5582	* instruction. So, if the last instruction, be it emulated or
				5583	* not, left the system with the INT_STI flag enabled, it
				5584	* means that the last instruction is an sti. We should not
				5585	* leave the flag on in this case. The same goes for mov ss
				5586	*/
				5587	if (int_shadow & mask)
				5588	mask = 0;
				5589	if (unlikely(int_shadow \|\| mask)) {
				5590	kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
				5591	if (!mask)
				5592	kvm_make_request(KVM_REQ_EVENT, vcpu);
				5593	}
				5594	}
				5595
				5596	static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
				5597	{
				5598	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				5599	if (ctxt->exception.vector == PF_VECTOR)
				5600	return kvm_propagate_fault(vcpu, &ctxt->exception);
				5601
				5602	if (ctxt->exception.error_code_valid)
				5603	kvm_queue_exception_e(vcpu, ctxt->exception.vector,
				5604	ctxt->exception.error_code);
				5605	else
				5606	kvm_queue_exception(vcpu, ctxt->exception.vector);
				5607	return false;
				5608	}
				5609
				5610	static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
				5611	{
				5612	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				5613	int cs_db, cs_l;
				5614
				5615	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
				5616
				5617	ctxt->eflags = kvm_get_rflags(vcpu);
				5618	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
				5619
				5620	ctxt->eip = kvm_rip_read(vcpu);
				5621	ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
				5622	(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
				5623	(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
				5624	cs_db ? X86EMUL_MODE_PROT32 :
				5625	X86EMUL_MODE_PROT16;
				5626	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
				5627	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
				5628	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
				5629
				5630	init_decode_cache(ctxt);
				5631	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
				5632	}
				5633
				5634	int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
				5635	{
				5636	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				5637	int ret;
				5638
				5639	init_emulate_ctxt(vcpu);
				5640
				5641	ctxt->op_bytes = 2;
				5642	ctxt->ad_bytes = 2;
				5643	ctxt->_eip = ctxt->eip + inc_eip;
				5644	ret = emulate_int_real(ctxt, irq);
				5645
				5646	if (ret != X86EMUL_CONTINUE)
				5647	return EMULATE_FAIL;
				5648
				5649	ctxt->eip = ctxt->_eip;
				5650	kvm_rip_write(vcpu, ctxt->eip);
				5651	kvm_set_rflags(vcpu, ctxt->eflags);
				5652
				5653	if (irq == NMI_VECTOR)
				5654	vcpu->arch.nmi_pending = 0;
				5655	else
				5656	vcpu->arch.interrupt.pending = false;
				5657
				5658	return EMULATE_DONE;
				5659	}
				5660	EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
				5661
				5662	static int handle_emulation_failure(struct kvm_vcpu *vcpu)
				5663	{
				5664	int r = EMULATE_DONE;
				5665
				5666	++vcpu->stat.insn_emulation_fail;
				5667	trace_kvm_emulate_insn_failed(vcpu);
				5668	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
				5669	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
				5670	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
				5671	vcpu->run->internal.ndata = 0;
				5672	r = EMULATE_USER_EXIT;
				5673	}
				5674	kvm_queue_exception(vcpu, UD_VECTOR);
				5675
				5676	return r;
				5677	}
				5678
				5679	static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
				5680	bool write_fault_to_shadow_pgtable,
				5681	int emulation_type)
				5682	{
				5683	gpa_t gpa = cr2;
				5684	kvm_pfn_t pfn;
				5685
				5686	if (emulation_type & EMULTYPE_NO_REEXECUTE)
				5687	return false;
				5688
				5689	if (!vcpu->arch.mmu.direct_map) {
				5690	/*
				5691	* Write permission should be allowed since only
				5692	* write access need to be emulated.
				5693	*/
				5694	gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
				5695
				5696	/*
				5697	* If the mapping is invalid in guest, let cpu retry
				5698	* it to generate fault.
				5699	*/
				5700	if (gpa == UNMAPPED_GVA)
				5701	return true;
				5702	}
				5703
				5704	/*
				5705	* Do not retry the unhandleable instruction if it faults on the
				5706	* readonly host memory, otherwise it will goto a infinite loop:
				5707	* retry instruction -> write #PF -> emulation fail -> retry
				5708	* instruction -> ...
				5709	*/
				5710	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
				5711
				5712	/*
				5713	* If the instruction failed on the error pfn, it can not be fixed,
				5714	* report the error to userspace.
				5715	*/
				5716	if (is_error_noslot_pfn(pfn))
				5717	return false;
				5718
				5719	kvm_release_pfn_clean(pfn);
				5720
				5721	/* The instructions are well-emulated on direct mmu. */
				5722	if (vcpu->arch.mmu.direct_map) {
				5723	unsigned int indirect_shadow_pages;
				5724
				5725	spin_lock(&vcpu->kvm->mmu_lock);
				5726	indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
				5727	spin_unlock(&vcpu->kvm->mmu_lock);
				5728
				5729	if (indirect_shadow_pages)
				5730	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				5731
				5732	return true;
				5733	}
				5734
				5735	/*
				5736	* if emulation was due to access to shadowed page table
				5737	* and it failed try to unshadow page and re-enter the
				5738	* guest to let CPU execute the instruction.
				5739	*/
				5740	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				5741
				5742	/*
				5743	* If the access faults on its page table, it can not
				5744	* be fixed by unprotecting shadow page and it should
				5745	* be reported to userspace.
				5746	*/
				5747	return !write_fault_to_shadow_pgtable;
				5748	}
				5749
				5750	static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
				5751	unsigned long cr2, int emulation_type)
				5752	{
				5753	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5754	unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
				5755
				5756	last_retry_eip = vcpu->arch.last_retry_eip;
				5757	last_retry_addr = vcpu->arch.last_retry_addr;
				5758
				5759	/*
				5760	* If the emulation is caused by #PF and it is non-page_table
				5761	* writing instruction, it means the VM-EXIT is caused by shadow
				5762	* page protected, we can zap the shadow page and retry this
				5763	* instruction directly.
				5764	*
				5765	* Note: if the guest uses a non-page-table modifying instruction
				5766	* on the PDE that points to the instruction, then we will unmap
				5767	* the instruction and go to an infinite loop. So, we cache the
				5768	* last retried eip and the last fault address, if we meet the eip
				5769	* and the address again, we can break out of the potential infinite
				5770	* loop.
				5771	*/
				5772	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
				5773
				5774	if (!(emulation_type & EMULTYPE_RETRY))
				5775	return false;
				5776
				5777	if (x86_page_table_writing_insn(ctxt))
				5778	return false;
				5779
				5780	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
				5781	return false;
				5782
				5783	vcpu->arch.last_retry_eip = ctxt->eip;
				5784	vcpu->arch.last_retry_addr = cr2;
				5785
				5786	if (!vcpu->arch.mmu.direct_map)
				5787	gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
				5788
				5789	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				5790
				5791	return true;
				5792	}
				5793
				5794	static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
				5795	static int complete_emulated_pio(struct kvm_vcpu *vcpu);
				5796
				5797	static void kvm_smm_changed(struct kvm_vcpu *vcpu)
				5798	{
				5799	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
				5800	/* This is a good place to trace that we are exiting SMM. */
				5801	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
				5802
				5803	/* Process a latched INIT or SMI, if any. */
				5804	kvm_make_request(KVM_REQ_EVENT, vcpu);
				5805	}
				5806
				5807	kvm_mmu_reset_context(vcpu);
				5808	}
				5809
				5810	static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
				5811	{
				5812	unsigned changed = vcpu->arch.hflags ^ emul_flags;
				5813
				5814	vcpu->arch.hflags = emul_flags;
				5815
				5816	if (changed & HF_SMM_MASK)
				5817	kvm_smm_changed(vcpu);
				5818	}
				5819
				5820	static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
				5821	unsigned long *db)
				5822	{
				5823	u32 dr6 = 0;
				5824	int i;
				5825	u32 enable, rwlen;
				5826
				5827	enable = dr7;
				5828	rwlen = dr7 >> 16;
				5829	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
				5830	if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
				5831	dr6 \|= (1 << i);
				5832	return dr6;
				5833	}
				5834
				5835	static void kvm_vcpu_do_singlestep(struct kvm_vcpu vcpu, int r)
				5836	{
				5837	struct kvm_run *kvm_run = vcpu->run;
				5838
				5839	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
				5840	kvm_run->debug.arch.dr6 = DR6_BS \| DR6_FIXED_1 \| DR6_RTM;
				5841	kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
				5842	kvm_run->debug.arch.exception = DB_VECTOR;
				5843	kvm_run->exit_reason = KVM_EXIT_DEBUG;
				5844	*r = EMULATE_USER_EXIT;
				5845	} else {
				5846	/*
				5847	* "Certain debug exceptions may clear bit 0-3. The
				5848	* remaining contents of the DR6 register are never
				5849	* cleared by the processor".
				5850	*/
				5851	vcpu->arch.dr6 &= ~15;
				5852	vcpu->arch.dr6 \|= DR6_BS \| DR6_RTM;
				5853	kvm_queue_exception(vcpu, DB_VECTOR);
				5854	}
				5855	}
				5856
				5857	int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
				5858	{
				5859	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
				5860	int r = EMULATE_DONE;
				5861
				5862	kvm_x86_ops->skip_emulated_instruction(vcpu);
				5863
				5864	/*
				5865	* rflags is the old, "raw" value of the flags. The new value has
				5866	* not been saved yet.
				5867	*
				5868	* This is correct even for TF set by the guest, because "the
				5869	* processor will not generate this exception after the instruction
				5870	* that sets the TF flag".
				5871	*/
				5872	if (unlikely(rflags & X86_EFLAGS_TF))
				5873	kvm_vcpu_do_singlestep(vcpu, &r);
				5874	return r == EMULATE_DONE;
				5875	}
				5876	EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
				5877
				5878	static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu vcpu, int r)
				5879	{
				5880	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
				5881	(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
				5882	struct kvm_run *kvm_run = vcpu->run;
				5883	unsigned long eip = kvm_get_linear_rip(vcpu);
				5884	u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
				5885	vcpu->arch.guest_debug_dr7,
				5886	vcpu->arch.eff_db);
				5887
				5888	if (dr6 != 0) {
				5889	kvm_run->debug.arch.dr6 = dr6 \| DR6_FIXED_1 \| DR6_RTM;
				5890	kvm_run->debug.arch.pc = eip;
				5891	kvm_run->debug.arch.exception = DB_VECTOR;
				5892	kvm_run->exit_reason = KVM_EXIT_DEBUG;
				5893	*r = EMULATE_USER_EXIT;
				5894	return true;
				5895	}
				5896	}
				5897
				5898	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
				5899	!(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
				5900	unsigned long eip = kvm_get_linear_rip(vcpu);
				5901	u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
				5902	vcpu->arch.dr7,
				5903	vcpu->arch.db);
				5904
				5905	if (dr6 != 0) {
				5906	vcpu->arch.dr6 &= ~15;
				5907	vcpu->arch.dr6 \|= dr6 \| DR6_RTM;
				5908	kvm_queue_exception(vcpu, DB_VECTOR);
				5909	*r = EMULATE_DONE;
				5910	return true;
				5911	}
				5912	}
				5913
				5914	return false;
				5915	}
				5916
				5917	int x86_emulate_instruction(struct kvm_vcpu *vcpu,
				5918	unsigned long cr2,
				5919	int emulation_type,
				5920	void *insn,
				5921	int insn_len)
				5922	{
				5923	int r;
				5924	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				5925	bool writeback = true;
				5926	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
				5927
				5928	vcpu->arch.l1tf_flush_l1d = true;
				5929
				5930	/*
				5931	* Clear write_fault_to_shadow_pgtable here to ensure it is
				5932	* never reused.
				5933	*/
				5934	vcpu->arch.write_fault_to_shadow_pgtable = false;
				5935	kvm_clear_exception_queue(vcpu);
				5936
				5937	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
				5938	init_emulate_ctxt(vcpu);
				5939
				5940	/*
				5941	* We will reenter on the same instruction since
				5942	* we do not set complete_userspace_io. This does not
				5943	* handle watchpoints yet, those would be handled in
				5944	* the emulate_ops.
				5945	*/
				5946	if (!(emulation_type & EMULTYPE_SKIP) &&
				5947	kvm_vcpu_check_breakpoint(vcpu, &r))
				5948	return r;
				5949
				5950	ctxt->interruptibility = 0;
				5951	ctxt->have_exception = false;
				5952	ctxt->exception.vector = -1;
				5953	ctxt->perm_ok = false;
				5954
				5955	ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
				5956
				5957	r = x86_decode_insn(ctxt, insn, insn_len);
				5958
				5959	trace_kvm_emulate_insn_start(vcpu);
				5960	++vcpu->stat.insn_emulation;
				5961	if (r != EMULATION_OK) {
				5962	if (emulation_type & EMULTYPE_TRAP_UD)
				5963	return EMULATE_FAIL;
				5964	if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
				5965	emulation_type))
				5966	return EMULATE_DONE;
				5967	if (ctxt->have_exception) {
				5968	/*
				5969	* #UD should result in just EMULATION_FAILED, and trap-like
				5970	* exception should not be encountered during decode.
				5971	*/
				5972	WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR \|\|
				5973	exception_type(ctxt->exception.vector) == EXCPT_TRAP);
				5974	inject_emulated_exception(vcpu);
				5975	return EMULATE_DONE;
				5976	}
				5977	if (emulation_type & EMULTYPE_SKIP)
				5978	return EMULATE_FAIL;
				5979	return handle_emulation_failure(vcpu);
				5980	}
				5981	}
				5982
				5983	if (emulation_type & EMULTYPE_SKIP) {
				5984	kvm_rip_write(vcpu, ctxt->_eip);
				5985	if (ctxt->eflags & X86_EFLAGS_RF)
				5986	kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
				5987	return EMULATE_DONE;
				5988	}
				5989
				5990	if (retry_instruction(ctxt, cr2, emulation_type))
				5991	return EMULATE_DONE;
				5992
				5993	/* this is needed for vmware backdoor interface to work since it
				5994	changes registers values during IO operation */
				5995	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
				5996	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
				5997	emulator_invalidate_register_cache(ctxt);
				5998	}
				5999
				6000	restart:
				6001	/* Save the faulting GPA (cr2) in the address field */
				6002	ctxt->exception.address = cr2;
				6003
				6004	r = x86_emulate_insn(ctxt);
				6005
				6006	if (r == EMULATION_INTERCEPTED)
				6007	return EMULATE_DONE;
				6008
				6009	if (r == EMULATION_FAILED) {
				6010	if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
				6011	emulation_type))
				6012	return EMULATE_DONE;
				6013
				6014	return handle_emulation_failure(vcpu);
				6015	}
				6016
				6017	if (ctxt->have_exception) {
				6018	r = EMULATE_DONE;
				6019	if (inject_emulated_exception(vcpu))
				6020	return r;
				6021	} else if (vcpu->arch.pio.count) {
				6022	if (!vcpu->arch.pio.in) {
				6023	/* FIXME: return into emulator if single-stepping. */
				6024	vcpu->arch.pio.count = 0;
				6025	} else {
				6026	writeback = false;
				6027	vcpu->arch.complete_userspace_io = complete_emulated_pio;
				6028	}
				6029	r = EMULATE_USER_EXIT;
				6030	} else if (vcpu->mmio_needed) {
				6031	if (!vcpu->mmio_is_write)
				6032	writeback = false;
				6033	r = EMULATE_USER_EXIT;
				6034	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
				6035	} else if (r == EMULATION_RESTART)
				6036	goto restart;
				6037	else
				6038	r = EMULATE_DONE;
				6039
				6040	if (writeback) {
				6041	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
				6042	toggle_interruptibility(vcpu, ctxt->interruptibility);
				6043	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
				6044	if (!ctxt->have_exception \|\|
				6045	exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
				6046	kvm_rip_write(vcpu, ctxt->eip);
				6047	if (r == EMULATE_DONE && ctxt->tf)
				6048	kvm_vcpu_do_singlestep(vcpu, &r);
				6049	__kvm_set_rflags(vcpu, ctxt->eflags);
				6050	}
				6051
				6052	/*
				6053	* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
				6054	* do nothing, and it will be requested again as soon as
				6055	* the shadow expires. But we still need to check here,
				6056	* because POPF has no interrupt shadow.
				6057	*/
				6058	if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
				6059	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6060	} else
				6061	vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
				6062
				6063	return r;
				6064	}
				6065	EXPORT_SYMBOL_GPL(x86_emulate_instruction);
				6066
				6067	int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
				6068	{
				6069	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
				6070	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
				6071	size, port, &val, 1);
				6072	/* do not return to emulator after return from userspace */
				6073	vcpu->arch.pio.count = 0;
				6074	return ret;
				6075	}
				6076	EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
				6077
				6078	static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
				6079	{
				6080	unsigned long val;
				6081
				6082	/* We should only ever be called with arch.pio.count equal to 1 */
				6083	BUG_ON(vcpu->arch.pio.count != 1);
				6084
				6085	/* For size less than 4 we merge, else we zero extend */
				6086	val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
				6087	: 0;
				6088
				6089	/*
				6090	* Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
				6091	* the copy and tracing
				6092	*/
				6093	emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
				6094	vcpu->arch.pio.port, &val, 1);
				6095	kvm_register_write(vcpu, VCPU_REGS_RAX, val);
				6096
				6097	return 1;
				6098	}
				6099
				6100	int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
				6101	{
				6102	unsigned long val;
				6103	int ret;
				6104
				6105	/* For size less than 4 we merge, else we zero extend */
				6106	val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
				6107
				6108	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
				6109	&val, 1);
				6110	if (ret) {
				6111	kvm_register_write(vcpu, VCPU_REGS_RAX, val);
				6112	return ret;
				6113	}
				6114
				6115	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
				6116
				6117	return 0;
				6118	}
				6119	EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
				6120
				6121	static int kvmclock_cpu_down_prep(unsigned int cpu)
				6122	{
				6123	__this_cpu_write(cpu_tsc_khz, 0);
				6124	return 0;
				6125	}
				6126
				6127	static void tsc_khz_changed(void *data)
				6128	{
				6129	struct cpufreq_freqs *freq = data;
				6130	unsigned long khz = 0;
				6131
				6132	if (data)
				6133	khz = freq->new;
				6134	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
				6135	khz = cpufreq_quick_get(raw_smp_processor_id());
				6136	if (!khz)
				6137	khz = tsc_khz;
				6138	__this_cpu_write(cpu_tsc_khz, khz);
				6139	}
				6140
				6141	static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				6142	void *data)
				6143	{
				6144	struct cpufreq_freqs *freq = data;
				6145	struct kvm *kvm;
				6146	struct kvm_vcpu *vcpu;
				6147	int i, send_ipi = 0;
				6148
				6149	/*
				6150	* We allow guests to temporarily run on slowing clocks,
				6151	* provided we notify them after, or to run on accelerating
				6152	* clocks, provided we notify them before. Thus time never
				6153	* goes backwards.
				6154	*
				6155	* However, we have a problem. We can't atomically update
				6156	* the frequency of a given CPU from this function; it is
				6157	* merely a notifier, which can be called from any CPU.
				6158	* Changing the TSC frequency at arbitrary points in time
				6159	* requires a recomputation of local variables related to
				6160	* the TSC for each VCPU. We must flag these local variables
				6161	* to be updated and be sure the update takes place with the
				6162	* new frequency before any guests proceed.
				6163	*
				6164	* Unfortunately, the combination of hotplug CPU and frequency
				6165	* change creates an intractable locking scenario; the order
				6166	* of when these callouts happen is undefined with respect to
				6167	* CPU hotplug, and they can race with each other. As such,
				6168	* merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
				6169	* undefined; you can actually have a CPU frequency change take
				6170	* place in between the computation of X and the setting of the
				6171	* variable. To protect against this problem, all updates of
				6172	* the per_cpu tsc_khz variable are done in an interrupt
				6173	* protected IPI, and all callers wishing to update the value
				6174	* must wait for a synchronous IPI to complete (which is trivial
				6175	* if the caller is on the CPU already). This establishes the
				6176	* necessary total order on variable updates.
				6177	*
				6178	* Note that because a guest time update may take place
				6179	* anytime after the setting of the VCPU's request bit, the
				6180	* correct TSC value must be set before the request. However,
				6181	* to ensure the update actually makes it to any guest which
				6182	* starts running in hardware virtualization between the set
				6183	* and the acquisition of the spinlock, we must also ping the
				6184	* CPU after setting the request bit.
				6185	*
				6186	*/
				6187
				6188	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
				6189	return 0;
				6190	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
				6191	return 0;
				6192
				6193	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
				6194
				6195	mutex_lock(&kvm_lock);
				6196	list_for_each_entry(kvm, &vm_list, vm_list) {
				6197	kvm_for_each_vcpu(i, vcpu, kvm) {
				6198	if (vcpu->cpu != freq->cpu)
				6199	continue;
				6200	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				6201	if (vcpu->cpu != raw_smp_processor_id())
				6202	send_ipi = 1;
				6203	}
				6204	}
				6205	mutex_unlock(&kvm_lock);
				6206
				6207	if (freq->old < freq->new && send_ipi) {
				6208	/*
				6209	* We upscale the frequency. Must make the guest
				6210	* doesn't see old kvmclock values while running with
				6211	* the new frequency, otherwise we risk the guest sees
				6212	* time go backwards.
				6213	*
				6214	* In case we update the frequency for another cpu
				6215	* (which might be in guest context) send an interrupt
				6216	* to kick the cpu out of guest context. Next time
				6217	* guest context is entered kvmclock will be updated,
				6218	* so the guest will not see stale values.
				6219	*/
				6220	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
				6221	}
				6222	return 0;
				6223	}
				6224
				6225	static struct notifier_block kvmclock_cpufreq_notifier_block = {
				6226	.notifier_call = kvmclock_cpufreq_notifier
				6227	};
				6228
				6229	static int kvmclock_cpu_online(unsigned int cpu)
				6230	{
				6231	tsc_khz_changed(NULL);
				6232	return 0;
				6233	}
				6234
				6235	static void kvm_timer_init(void)
				6236	{
				6237	max_tsc_khz = tsc_khz;
				6238
				6239	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
				6240	#ifdef CONFIG_CPU_FREQ
				6241	struct cpufreq_policy policy;
				6242	int cpu;
				6243
				6244	memset(&policy, 0, sizeof(policy));
				6245	cpu = get_cpu();
				6246	cpufreq_get_policy(&policy, cpu);
				6247	if (policy.cpuinfo.max_freq)
				6248	max_tsc_khz = policy.cpuinfo.max_freq;
				6249	put_cpu();
				6250	#endif
				6251	cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
				6252	CPUFREQ_TRANSITION_NOTIFIER);
				6253	}
				6254	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
				6255
				6256	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
				6257	kvmclock_cpu_online, kvmclock_cpu_down_prep);
				6258	}
				6259
				6260	static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
				6261
				6262	int kvm_is_in_guest(void)
				6263	{
				6264	return __this_cpu_read(current_vcpu) != NULL;
				6265	}
				6266
				6267	static int kvm_is_user_mode(void)
				6268	{
				6269	int user_mode = 3;
				6270
				6271	if (__this_cpu_read(current_vcpu))
				6272	user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
				6273
				6274	return user_mode != 0;
				6275	}
				6276
				6277	static unsigned long kvm_get_guest_ip(void)
				6278	{
				6279	unsigned long ip = 0;
				6280
				6281	if (__this_cpu_read(current_vcpu))
				6282	ip = kvm_rip_read(__this_cpu_read(current_vcpu));
				6283
				6284	return ip;
				6285	}
				6286
				6287	static struct perf_guest_info_callbacks kvm_guest_cbs = {
				6288	.is_in_guest = kvm_is_in_guest,
				6289	.is_user_mode = kvm_is_user_mode,
				6290	.get_guest_ip = kvm_get_guest_ip,
				6291	};
				6292
				6293	void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
				6294	{
				6295	__this_cpu_write(current_vcpu, vcpu);
				6296	}
				6297	EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
				6298
				6299	void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
				6300	{
				6301	__this_cpu_write(current_vcpu, NULL);
				6302	}
				6303	EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
				6304
				6305	#ifdef CONFIG_X86_64
				6306	static void pvclock_gtod_update_fn(struct work_struct *work)
				6307	{
				6308	struct kvm *kvm;
				6309
				6310	struct kvm_vcpu *vcpu;
				6311	int i;
				6312
				6313	mutex_lock(&kvm_lock);
				6314	list_for_each_entry(kvm, &vm_list, vm_list)
				6315	kvm_for_each_vcpu(i, vcpu, kvm)
				6316	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				6317	atomic_set(&kvm_guest_has_master_clock, 0);
				6318	mutex_unlock(&kvm_lock);
				6319	}
				6320
				6321	static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
				6322
				6323	/*
				6324	* Notification about pvclock gtod data update.
				6325	*/
				6326	static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
				6327	void *priv)
				6328	{
				6329	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				6330	struct timekeeper *tk = priv;
				6331
				6332	update_pvclock_gtod(tk);
				6333
				6334	/* disable master clock if host does not trust, or does not
				6335	* use, TSC clocksource
				6336	*/
				6337	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
				6338	atomic_read(&kvm_guest_has_master_clock) != 0)
				6339	queue_work(system_long_wq, &pvclock_gtod_work);
				6340
				6341	return 0;
				6342	}
				6343
				6344	static struct notifier_block pvclock_gtod_notifier = {
				6345	.notifier_call = pvclock_gtod_notify,
				6346	};
				6347	#endif
				6348
				6349	int kvm_arch_init(void *opaque)
				6350	{
				6351	int r;
				6352	struct kvm_x86_ops *ops = opaque;
				6353
				6354	if (kvm_x86_ops) {
				6355	printk(KERN_ERR "kvm: already loaded the other module\n");
				6356	r = -EEXIST;
				6357	goto out;
				6358	}
				6359
				6360	if (!ops->cpu_has_kvm_support()) {
				6361	printk(KERN_ERR "kvm: no hardware support\n");
				6362	r = -EOPNOTSUPP;
				6363	goto out;
				6364	}
				6365	if (ops->disabled_by_bios()) {
				6366	printk(KERN_ERR "kvm: disabled by bios\n");
				6367	r = -EOPNOTSUPP;
				6368	goto out;
				6369	}
				6370
				6371	r = -ENOMEM;
				6372	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
				6373	if (!shared_msrs) {
				6374	printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
				6375	goto out;
				6376	}
				6377
				6378	r = kvm_mmu_module_init();
				6379	if (r)
				6380	goto out_free_percpu;
				6381
				6382	kvm_x86_ops = ops;
				6383
				6384	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
				6385	PT_DIRTY_MASK, PT64_NX_MASK, 0,
				6386	PT_PRESENT_MASK, 0, sme_me_mask);
				6387	kvm_timer_init();
				6388
				6389	perf_register_guest_info_callbacks(&kvm_guest_cbs);
				6390
				6391	if (boot_cpu_has(X86_FEATURE_XSAVE))
				6392	host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
				6393
				6394	kvm_lapic_init();
				6395	#ifdef CONFIG_X86_64
				6396	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
				6397	#endif
				6398
				6399	return 0;
				6400
				6401	out_free_percpu:
				6402	free_percpu(shared_msrs);
				6403	out:
				6404	return r;
				6405	}
				6406
				6407	void kvm_arch_exit(void)
				6408	{
				6409	kvm_lapic_exit();
				6410	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
				6411
				6412	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
				6413	cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
				6414	CPUFREQ_TRANSITION_NOTIFIER);
				6415	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
				6416	#ifdef CONFIG_X86_64
				6417	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
				6418	#endif
				6419	kvm_x86_ops = NULL;
				6420	kvm_mmu_module_exit();
				6421	free_percpu(shared_msrs);
				6422	}
				6423
				6424	int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
				6425	{
				6426	++vcpu->stat.halt_exits;
				6427	if (lapic_in_kernel(vcpu)) {
				6428	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
				6429	return 1;
				6430	} else {
				6431	vcpu->run->exit_reason = KVM_EXIT_HLT;
				6432	return 0;
				6433	}
				6434	}
				6435	EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
				6436
				6437	int kvm_emulate_halt(struct kvm_vcpu *vcpu)
				6438	{
				6439	int ret = kvm_skip_emulated_instruction(vcpu);
				6440	/*
				6441	* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
				6442	* KVM_EXIT_DEBUG here.
				6443	*/
				6444	return kvm_vcpu_halt(vcpu) && ret;
				6445	}
				6446	EXPORT_SYMBOL_GPL(kvm_emulate_halt);
				6447
				6448	#ifdef CONFIG_X86_64
				6449	static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
				6450	unsigned long clock_type)
				6451	{
				6452	struct kvm_clock_pairing clock_pairing;
				6453	struct timespec ts;
				6454	u64 cycle;
				6455	int ret;
				6456
				6457	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
				6458	return -KVM_EOPNOTSUPP;
				6459
				6460	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
				6461	return -KVM_EOPNOTSUPP;
				6462
				6463	clock_pairing.sec = ts.tv_sec;
				6464	clock_pairing.nsec = ts.tv_nsec;
				6465	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
				6466	clock_pairing.flags = 0;
				6467	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
				6468
				6469	ret = 0;
				6470	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
				6471	sizeof(struct kvm_clock_pairing)))
				6472	ret = -KVM_EFAULT;
				6473
				6474	return ret;
				6475	}
				6476	#endif
				6477
				6478	/*
				6479	* kvm_pv_kick_cpu_op: Kick a vcpu.
				6480	*
				6481	* @apicid - apicid of vcpu to be kicked.
				6482	*/
				6483	static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
				6484	{
				6485	struct kvm_lapic_irq lapic_irq;
				6486
				6487	lapic_irq.shorthand = 0;
				6488	lapic_irq.dest_mode = 0;
				6489	lapic_irq.level = 0;
				6490	lapic_irq.dest_id = apicid;
				6491	lapic_irq.msi_redir_hint = false;
				6492
				6493	lapic_irq.delivery_mode = APIC_DM_REMRD;
				6494	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
				6495	}
				6496
				6497	void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
				6498	{
				6499	vcpu->arch.apicv_active = false;
				6500	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
				6501	}
				6502
				6503	int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
				6504	{
				6505	unsigned long nr, a0, a1, a2, a3, ret;
				6506	int op_64_bit;
				6507
				6508	if (kvm_hv_hypercall_enabled(vcpu->kvm)) {
				6509	if (!kvm_hv_hypercall(vcpu))
				6510	return 0;
				6511	goto out;
				6512	}
				6513
				6514	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
				6515	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
				6516	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
				6517	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
				6518	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
				6519
				6520	trace_kvm_hypercall(nr, a0, a1, a2, a3);
				6521
				6522	op_64_bit = is_64_bit_mode(vcpu);
				6523	if (!op_64_bit) {
				6524	nr &= 0xFFFFFFFF;
				6525	a0 &= 0xFFFFFFFF;
				6526	a1 &= 0xFFFFFFFF;
				6527	a2 &= 0xFFFFFFFF;
				6528	a3 &= 0xFFFFFFFF;
				6529	}
				6530
				6531	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
				6532	ret = -KVM_EPERM;
				6533	goto out_error;
				6534	}
				6535
				6536	switch (nr) {
				6537	case KVM_HC_VAPIC_POLL_IRQ:
				6538	ret = 0;
				6539	break;
				6540	case KVM_HC_KICK_CPU:
				6541	kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
				6542	ret = 0;
				6543	break;
				6544	#ifdef CONFIG_X86_64
				6545	case KVM_HC_CLOCK_PAIRING:
				6546	ret = kvm_pv_clock_pairing(vcpu, a0, a1);
				6547	break;
				6548	#endif
				6549	default:
				6550	ret = -KVM_ENOSYS;
				6551	break;
				6552	}
				6553	out_error:
				6554	if (!op_64_bit)
				6555	ret = (u32)ret;
				6556	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
				6557
				6558	out:
				6559	++vcpu->stat.hypercalls;
				6560	return kvm_skip_emulated_instruction(vcpu);
				6561	}
				6562	EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
				6563
				6564	static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
				6565	{
				6566	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6567	char instruction[3];
				6568	unsigned long rip = kvm_rip_read(vcpu);
				6569
				6570	kvm_x86_ops->patch_hypercall(vcpu, instruction);
				6571
				6572	return emulator_write_emulated(ctxt, rip, instruction, 3,
				6573	&ctxt->exception);
				6574	}
				6575
				6576	static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
				6577	{
				6578	return vcpu->run->request_interrupt_window &&
				6579	likely(!pic_in_kernel(vcpu->kvm));
				6580	}
				6581
				6582	static void post_kvm_run_save(struct kvm_vcpu *vcpu)
				6583	{
				6584	struct kvm_run *kvm_run = vcpu->run;
				6585
				6586	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
				6587	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
				6588	kvm_run->cr8 = kvm_get_cr8(vcpu);
				6589	kvm_run->apic_base = kvm_get_apic_base(vcpu);
				6590	kvm_run->ready_for_interrupt_injection =
				6591	pic_in_kernel(vcpu->kvm) \|\|
				6592	kvm_vcpu_ready_for_interrupt_injection(vcpu);
				6593	}
				6594
				6595	static void update_cr8_intercept(struct kvm_vcpu *vcpu)
				6596	{
				6597	int max_irr, tpr;
				6598
				6599	if (!kvm_x86_ops->update_cr8_intercept)
				6600	return;
				6601
				6602	if (!lapic_in_kernel(vcpu))
				6603	return;
				6604
				6605	if (vcpu->arch.apicv_active)
				6606	return;
				6607
				6608	if (!vcpu->arch.apic->vapic_addr)
				6609	max_irr = kvm_lapic_find_highest_irr(vcpu);
				6610	else
				6611	max_irr = -1;
				6612
				6613	if (max_irr != -1)
				6614	max_irr >>= 4;
				6615
				6616	tpr = kvm_lapic_get_cr8(vcpu);
				6617
				6618	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
				6619	}
				6620
				6621	static int inject_pending_event(struct kvm_vcpu *vcpu)
				6622	{
				6623	int r;
				6624
				6625	/* try to reinject previous events if any */
				6626	if (vcpu->arch.exception.injected) {
				6627	kvm_x86_ops->queue_exception(vcpu);
				6628	return 0;
				6629	}
				6630
				6631	/*
				6632	* Exceptions must be injected immediately, or the exception
				6633	* frame will have the address of the NMI or interrupt handler.
				6634	*/
				6635	if (!vcpu->arch.exception.pending) {
				6636	if (vcpu->arch.nmi_injected) {
				6637	kvm_x86_ops->set_nmi(vcpu);
				6638	return 0;
				6639	}
				6640
				6641	if (vcpu->arch.interrupt.pending) {
				6642	kvm_x86_ops->set_irq(vcpu);
				6643	return 0;
				6644	}
				6645	}
				6646
				6647	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
				6648	r = kvm_x86_ops->check_nested_events(vcpu);
				6649	if (r != 0)
				6650	return r;
				6651	}
				6652
				6653	/* try to inject new event if pending */
				6654	if (vcpu->arch.exception.pending) {
				6655	trace_kvm_inj_exception(vcpu->arch.exception.nr,
				6656	vcpu->arch.exception.has_error_code,
				6657	vcpu->arch.exception.error_code);
				6658
				6659	vcpu->arch.exception.pending = false;
				6660	vcpu->arch.exception.injected = true;
				6661
				6662	if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
				6663	__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) \|
				6664	X86_EFLAGS_RF);
				6665
				6666	if (vcpu->arch.exception.nr == DB_VECTOR &&
				6667	(vcpu->arch.dr7 & DR7_GD)) {
				6668	vcpu->arch.dr7 &= ~DR7_GD;
				6669	kvm_update_dr7(vcpu);
				6670	}
				6671
				6672	kvm_x86_ops->queue_exception(vcpu);
				6673	} else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
				6674	vcpu->arch.smi_pending = false;
				6675	enter_smm(vcpu);
				6676	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
				6677	--vcpu->arch.nmi_pending;
				6678	vcpu->arch.nmi_injected = true;
				6679	kvm_x86_ops->set_nmi(vcpu);
				6680	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
				6681	/*
				6682	* Because interrupts can be injected asynchronously, we are
				6683	* calling check_nested_events again here to avoid a race condition.
				6684	* See https://lkml.org/lkml/2014/7/2/60 for discussion about this
				6685	* proposal and current concerns. Perhaps we should be setting
				6686	* KVM_REQ_EVENT only on certain events and not unconditionally?
				6687	*/
				6688	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
				6689	r = kvm_x86_ops->check_nested_events(vcpu);
				6690	if (r != 0)
				6691	return r;
				6692	}
				6693	if (kvm_x86_ops->interrupt_allowed(vcpu)) {
				6694	kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
				6695	false);
				6696	kvm_x86_ops->set_irq(vcpu);
				6697	}
				6698	}
				6699
				6700	return 0;
				6701	}
				6702
				6703	static void process_nmi(struct kvm_vcpu *vcpu)
				6704	{
				6705	unsigned limit = 2;
				6706
				6707	/*
				6708	* x86 is limited to one NMI running, and one NMI pending after it.
				6709	* If an NMI is already in progress, limit further NMIs to just one.
				6710	* Otherwise, allow two (and we'll inject the first one immediately).
				6711	*/
				6712	if (kvm_x86_ops->get_nmi_mask(vcpu) \|\| vcpu->arch.nmi_injected)
				6713	limit = 1;
				6714
				6715	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
				6716	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
				6717	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6718	}
				6719
				6720	#define put_smstate(type, buf, offset, val) \
				6721	(type )((buf) + (offset) - 0x7e00) = val
				6722
				6723	static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
				6724	{
				6725	u32 flags = 0;
				6726	flags \|= seg->g << 23;
				6727	flags \|= seg->db << 22;
				6728	flags \|= seg->l << 21;
				6729	flags \|= seg->avl << 20;
				6730	flags \|= seg->present << 15;
				6731	flags \|= seg->dpl << 13;
				6732	flags \|= seg->s << 12;
				6733	flags \|= seg->type << 8;
				6734	return flags;
				6735	}
				6736
				6737	static void enter_smm_save_seg_32(struct kvm_vcpu vcpu, char buf, int n)
				6738	{
				6739	struct kvm_segment seg;
				6740	int offset;
				6741
				6742	kvm_get_segment(vcpu, &seg, n);
				6743	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
				6744
				6745	if (n < 3)
				6746	offset = 0x7f84 + n * 12;
				6747	else
				6748	offset = 0x7f2c + (n - 3) * 12;
				6749
				6750	put_smstate(u32, buf, offset + 8, seg.base);
				6751	put_smstate(u32, buf, offset + 4, seg.limit);
				6752	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
				6753	}
				6754
				6755	#ifdef CONFIG_X86_64
				6756	static void enter_smm_save_seg_64(struct kvm_vcpu vcpu, char buf, int n)
				6757	{
				6758	struct kvm_segment seg;
				6759	int offset;
				6760	u16 flags;
				6761
				6762	kvm_get_segment(vcpu, &seg, n);
				6763	offset = 0x7e00 + n * 16;
				6764
				6765	flags = enter_smm_get_segment_flags(&seg) >> 8;
				6766	put_smstate(u16, buf, offset, seg.selector);
				6767	put_smstate(u16, buf, offset + 2, flags);
				6768	put_smstate(u32, buf, offset + 4, seg.limit);
				6769	put_smstate(u64, buf, offset + 8, seg.base);
				6770	}
				6771	#endif
				6772
				6773	static void enter_smm_save_state_32(struct kvm_vcpu vcpu, char buf)
				6774	{
				6775	struct desc_ptr dt;
				6776	struct kvm_segment seg;
				6777	unsigned long val;
				6778	int i;
				6779
				6780	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
				6781	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
				6782	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
				6783	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
				6784
				6785	for (i = 0; i < 8; i++)
				6786	put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
				6787
				6788	kvm_get_dr(vcpu, 6, &val);
				6789	put_smstate(u32, buf, 0x7fcc, (u32)val);
				6790	kvm_get_dr(vcpu, 7, &val);
				6791	put_smstate(u32, buf, 0x7fc8, (u32)val);
				6792
				6793	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
				6794	put_smstate(u32, buf, 0x7fc4, seg.selector);
				6795	put_smstate(u32, buf, 0x7f64, seg.base);
				6796	put_smstate(u32, buf, 0x7f60, seg.limit);
				6797	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
				6798
				6799	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
				6800	put_smstate(u32, buf, 0x7fc0, seg.selector);
				6801	put_smstate(u32, buf, 0x7f80, seg.base);
				6802	put_smstate(u32, buf, 0x7f7c, seg.limit);
				6803	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
				6804
				6805	kvm_x86_ops->get_gdt(vcpu, &dt);
				6806	put_smstate(u32, buf, 0x7f74, dt.address);
				6807	put_smstate(u32, buf, 0x7f70, dt.size);
				6808
				6809	kvm_x86_ops->get_idt(vcpu, &dt);
				6810	put_smstate(u32, buf, 0x7f58, dt.address);
				6811	put_smstate(u32, buf, 0x7f54, dt.size);
				6812
				6813	for (i = 0; i < 6; i++)
				6814	enter_smm_save_seg_32(vcpu, buf, i);
				6815
				6816	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
				6817
				6818	/* revision id */
				6819	put_smstate(u32, buf, 0x7efc, 0x00020000);
				6820	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
				6821	}
				6822
				6823	static void enter_smm_save_state_64(struct kvm_vcpu vcpu, char buf)
				6824	{
				6825	#ifdef CONFIG_X86_64
				6826	struct desc_ptr dt;
				6827	struct kvm_segment seg;
				6828	unsigned long val;
				6829	int i;
				6830
				6831	for (i = 0; i < 16; i++)
				6832	put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
				6833
				6834	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
				6835	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
				6836
				6837	kvm_get_dr(vcpu, 6, &val);
				6838	put_smstate(u64, buf, 0x7f68, val);
				6839	kvm_get_dr(vcpu, 7, &val);
				6840	put_smstate(u64, buf, 0x7f60, val);
				6841
				6842	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
				6843	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
				6844	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
				6845
				6846	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
				6847
				6848	/* revision id */
				6849	put_smstate(u32, buf, 0x7efc, 0x00020064);
				6850
				6851	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
				6852
				6853	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
				6854	put_smstate(u16, buf, 0x7e90, seg.selector);
				6855	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
				6856	put_smstate(u32, buf, 0x7e94, seg.limit);
				6857	put_smstate(u64, buf, 0x7e98, seg.base);
				6858
				6859	kvm_x86_ops->get_idt(vcpu, &dt);
				6860	put_smstate(u32, buf, 0x7e84, dt.size);
				6861	put_smstate(u64, buf, 0x7e88, dt.address);
				6862
				6863	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
				6864	put_smstate(u16, buf, 0x7e70, seg.selector);
				6865	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
				6866	put_smstate(u32, buf, 0x7e74, seg.limit);
				6867	put_smstate(u64, buf, 0x7e78, seg.base);
				6868
				6869	kvm_x86_ops->get_gdt(vcpu, &dt);
				6870	put_smstate(u32, buf, 0x7e64, dt.size);
				6871	put_smstate(u64, buf, 0x7e68, dt.address);
				6872
				6873	for (i = 0; i < 6; i++)
				6874	enter_smm_save_seg_64(vcpu, buf, i);
				6875	#else
				6876	WARN_ON_ONCE(1);
				6877	#endif
				6878	}
				6879
				6880	static void enter_smm(struct kvm_vcpu *vcpu)
				6881	{
				6882	struct kvm_segment cs, ds;
				6883	struct desc_ptr dt;
				6884	char buf[512];
				6885	u32 cr0;
				6886
				6887	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
				6888	vcpu->arch.hflags \|= HF_SMM_MASK;
				6889	memset(buf, 0, 512);
				6890	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
				6891	enter_smm_save_state_64(vcpu, buf);
				6892	else
				6893	enter_smm_save_state_32(vcpu, buf);
				6894
				6895	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
				6896
				6897	if (kvm_x86_ops->get_nmi_mask(vcpu))
				6898	vcpu->arch.hflags \|= HF_SMM_INSIDE_NMI_MASK;
				6899	else
				6900	kvm_x86_ops->set_nmi_mask(vcpu, true);
				6901
				6902	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
				6903	kvm_rip_write(vcpu, 0x8000);
				6904
				6905	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE \| X86_CR0_EM \| X86_CR0_TS \| X86_CR0_PG);
				6906	kvm_x86_ops->set_cr0(vcpu, cr0);
				6907	vcpu->arch.cr0 = cr0;
				6908
				6909	kvm_x86_ops->set_cr4(vcpu, 0);
				6910
				6911	/* Undocumented: IDT limit is set to zero on entry to SMM. */
				6912	dt.address = dt.size = 0;
				6913	kvm_x86_ops->set_idt(vcpu, &dt);
				6914
				6915	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
				6916
				6917	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
				6918	cs.base = vcpu->arch.smbase;
				6919
				6920	ds.selector = 0;
				6921	ds.base = 0;
				6922
				6923	cs.limit = ds.limit = 0xffffffff;
				6924	cs.type = ds.type = 0x3;
				6925	cs.dpl = ds.dpl = 0;
				6926	cs.db = ds.db = 0;
				6927	cs.s = ds.s = 1;
				6928	cs.l = ds.l = 0;
				6929	cs.g = ds.g = 1;
				6930	cs.avl = ds.avl = 0;
				6931	cs.present = ds.present = 1;
				6932	cs.unusable = ds.unusable = 0;
				6933	cs.padding = ds.padding = 0;
				6934
				6935	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
				6936	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
				6937	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
				6938	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
				6939	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
				6940	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
				6941
				6942	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
				6943	kvm_x86_ops->set_efer(vcpu, 0);
				6944
				6945	kvm_update_cpuid(vcpu);
				6946	kvm_mmu_reset_context(vcpu);
				6947	}
				6948
				6949	static void process_smi(struct kvm_vcpu *vcpu)
				6950	{
				6951	vcpu->arch.smi_pending = true;
				6952	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6953	}
				6954
				6955	void kvm_make_scan_ioapic_request(struct kvm *kvm)
				6956	{
				6957	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
				6958	}
				6959
				6960	static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
				6961	{
				6962	u64 eoi_exit_bitmap[4];
				6963
				6964	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
				6965	return;
				6966
				6967	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
				6968
				6969	if (irqchip_split(vcpu->kvm))
				6970	kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
				6971	else {
				6972	if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
				6973	kvm_x86_ops->sync_pir_to_irr(vcpu);
				6974	if (ioapic_in_kernel(vcpu->kvm))
				6975	kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
				6976	}
				6977	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
				6978	vcpu_to_synic(vcpu)->vec_bitmap, 256);
				6979	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
				6980	}
				6981
				6982	static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
				6983	{
				6984	++vcpu->stat.tlb_flush;
				6985	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
				6986	}
				6987
				6988	void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
				6989	unsigned long start, unsigned long end)
				6990	{
				6991	unsigned long apic_address;
				6992
				6993	/*
				6994	* The physical address of apic access page is stored in the VMCS.
				6995	* Update it when it becomes invalid.
				6996	*/
				6997	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
				6998	if (start <= apic_address && apic_address < end)
				6999	kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
				7000	}
				7001
				7002	void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
				7003	{
				7004	struct page *page = NULL;
				7005
				7006	if (!lapic_in_kernel(vcpu))
				7007	return;
				7008
				7009	if (!kvm_x86_ops->set_apic_access_page_addr)
				7010	return;
				7011
				7012	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
				7013	if (is_error_page(page))
				7014	return;
				7015	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
				7016
				7017	/*
				7018	* Do not pin apic access page in memory, the MMU notifier
				7019	* will call us again if it is migrated or swapped out.
				7020	*/
				7021	put_page(page);
				7022	}
				7023	EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
				7024
				7025	/*
				7026	* Returns 1 to let vcpu_run() continue the guest execution loop without
				7027	* exiting to the userspace. Otherwise, the value will be returned to the
				7028	* userspace.
				7029	*/
				7030	static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
				7031	{
				7032	int r;
				7033	bool req_int_win =
				7034	dm_request_for_irq_injection(vcpu) &&
				7035	kvm_cpu_accept_dm_intr(vcpu);
				7036
				7037	bool req_immediate_exit = false;
				7038
				7039	if (kvm_request_pending(vcpu)) {
				7040	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
				7041	kvm_mmu_unload(vcpu);
				7042	if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
				7043	__kvm_migrate_timers(vcpu);
				7044	if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
				7045	kvm_gen_update_masterclock(vcpu->kvm);
				7046	if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
				7047	kvm_gen_kvmclock_update(vcpu);
				7048	if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
				7049	r = kvm_guest_time_update(vcpu);
				7050	if (unlikely(r))
				7051	goto out;
				7052	}
				7053	if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
				7054	kvm_mmu_sync_roots(vcpu);
				7055	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
				7056	kvm_vcpu_flush_tlb(vcpu, true);
				7057	if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
				7058	vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
				7059	r = 0;
				7060	goto out;
				7061	}
				7062	if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
				7063	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
				7064	vcpu->mmio_needed = 0;
				7065	r = 0;
				7066	goto out;
				7067	}
				7068	if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
				7069	/* Page is swapped out. Do synthetic halt */
				7070	vcpu->arch.apf.halted = true;
				7071	r = 1;
				7072	goto out;
				7073	}
				7074	if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
				7075	record_steal_time(vcpu);
				7076	if (kvm_check_request(KVM_REQ_SMI, vcpu))
				7077	process_smi(vcpu);
				7078	if (kvm_check_request(KVM_REQ_NMI, vcpu))
				7079	process_nmi(vcpu);
				7080	if (kvm_check_request(KVM_REQ_PMU, vcpu))
				7081	kvm_pmu_handle_event(vcpu);
				7082	if (kvm_check_request(KVM_REQ_PMI, vcpu))
				7083	kvm_pmu_deliver_pmi(vcpu);
				7084	if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
				7085	BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
				7086	if (test_bit(vcpu->arch.pending_ioapic_eoi,
				7087	vcpu->arch.ioapic_handled_vectors)) {
				7088	vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
				7089	vcpu->run->eoi.vector =
				7090	vcpu->arch.pending_ioapic_eoi;
				7091	r = 0;
				7092	goto out;
				7093	}
				7094	}
				7095	if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
				7096	vcpu_scan_ioapic(vcpu);
				7097	if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
				7098	kvm_vcpu_reload_apic_access_page(vcpu);
				7099	if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
				7100	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
				7101	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
				7102	r = 0;
				7103	goto out;
				7104	}
				7105	if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
				7106	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
				7107	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
				7108	r = 0;
				7109	goto out;
				7110	}
				7111	if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
				7112	vcpu->run->exit_reason = KVM_EXIT_HYPERV;
				7113	vcpu->run->hyperv = vcpu->arch.hyperv.exit;
				7114	r = 0;
				7115	goto out;
				7116	}
				7117
				7118	/*
				7119	* KVM_REQ_HV_STIMER has to be processed after
				7120	* KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
				7121	* depend on the guest clock being up-to-date
				7122	*/
				7123	if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
				7124	kvm_hv_process_stimers(vcpu);
				7125	}
				7126
				7127	if (kvm_check_request(KVM_REQ_EVENT, vcpu) \|\| req_int_win) {
				7128	++vcpu->stat.req_event;
				7129	kvm_apic_accept_events(vcpu);
				7130	if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
				7131	r = 1;
				7132	goto out;
				7133	}
				7134
				7135	if (inject_pending_event(vcpu) != 0)
				7136	req_immediate_exit = true;
				7137	else {
				7138	/* Enable NMI/IRQ window open exits if needed.
				7139	*
				7140	* SMIs have two cases: 1) they can be nested, and
				7141	* then there is nothing to do here because RSM will
				7142	* cause a vmexit anyway; 2) or the SMI can be pending
				7143	* because inject_pending_event has completed the
				7144	* injection of an IRQ or NMI from the previous vmexit,
				7145	* and then we request an immediate exit to inject the SMI.
				7146	*/
				7147	if (vcpu->arch.smi_pending && !is_smm(vcpu))
				7148	req_immediate_exit = true;
				7149	if (vcpu->arch.nmi_pending)
				7150	kvm_x86_ops->enable_nmi_window(vcpu);
				7151	if (kvm_cpu_has_injectable_intr(vcpu) \|\| req_int_win)
				7152	kvm_x86_ops->enable_irq_window(vcpu);
				7153	WARN_ON(vcpu->arch.exception.pending);
				7154	}
				7155
				7156	if (kvm_lapic_enabled(vcpu)) {
				7157	update_cr8_intercept(vcpu);
				7158	kvm_lapic_sync_to_vapic(vcpu);
				7159	}
				7160	}
				7161
				7162	r = kvm_mmu_reload(vcpu);
				7163	if (unlikely(r)) {
				7164	goto cancel_injection;
				7165	}
				7166
				7167	preempt_disable();
				7168
				7169	kvm_x86_ops->prepare_guest_switch(vcpu);
				7170
				7171	/*
				7172	* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
				7173	* IPI are then delayed after guest entry, which ensures that they
				7174	* result in virtual interrupt delivery.
				7175	*/
				7176	local_irq_disable();
				7177	vcpu->mode = IN_GUEST_MODE;
				7178
				7179	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
				7180
				7181	/*
				7182	* 1) We should set ->mode before checking ->requests. Please see
				7183	* the comment in kvm_vcpu_exiting_guest_mode().
				7184	*
				7185	* 2) For APICv, we should set ->mode before checking PIR.ON. This
				7186	* pairs with the memory barrier implicit in pi_test_and_set_on
				7187	* (see vmx_deliver_posted_interrupt).
				7188	*
				7189	* 3) This also orders the write to mode from any reads to the page
				7190	* tables done while the VCPU is running. Please see the comment
				7191	* in kvm_flush_remote_tlbs.
				7192	*/
				7193	smp_mb__after_srcu_read_unlock();
				7194
				7195	/*
				7196	* This handles the case where a posted interrupt was
				7197	* notified with kvm_vcpu_kick.
				7198	*/
				7199	if (kvm_lapic_enabled(vcpu)) {
				7200	if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
				7201	kvm_x86_ops->sync_pir_to_irr(vcpu);
				7202	}
				7203
				7204	if (vcpu->mode == EXITING_GUEST_MODE \|\| kvm_request_pending(vcpu)
				7205	\|\| need_resched() \|\| signal_pending(current)) {
				7206	vcpu->mode = OUTSIDE_GUEST_MODE;
				7207	smp_wmb();
				7208	local_irq_enable();
				7209	preempt_enable();
				7210	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				7211	r = 1;
				7212	goto cancel_injection;
				7213	}
				7214
				7215	kvm_load_guest_xcr0(vcpu);
				7216
				7217	if (req_immediate_exit) {
				7218	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7219	smp_send_reschedule(vcpu->cpu);
				7220	}
				7221
				7222	trace_kvm_entry(vcpu->vcpu_id);
				7223	wait_lapic_expire(vcpu);
				7224	guest_enter_irqoff();
				7225
				7226	if (unlikely(vcpu->arch.switch_db_regs)) {
				7227	set_debugreg(0, 7);
				7228	set_debugreg(vcpu->arch.eff_db[0], 0);
				7229	set_debugreg(vcpu->arch.eff_db[1], 1);
				7230	set_debugreg(vcpu->arch.eff_db[2], 2);
				7231	set_debugreg(vcpu->arch.eff_db[3], 3);
				7232	set_debugreg(vcpu->arch.dr6, 6);
				7233	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
				7234	}
				7235
				7236	kvm_x86_ops->run(vcpu);
				7237
				7238	/*
				7239	* Do this here before restoring debug registers on the host. And
				7240	* since we do this before handling the vmexit, a DR access vmexit
				7241	* can (a) read the correct value of the debug registers, (b) set
				7242	* KVM_DEBUGREG_WONT_EXIT again.
				7243	*/
				7244	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
				7245	WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
				7246	kvm_x86_ops->sync_dirty_debug_regs(vcpu);
				7247	kvm_update_dr0123(vcpu);
				7248	kvm_update_dr6(vcpu);
				7249	kvm_update_dr7(vcpu);
				7250	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
				7251	}
				7252
				7253	/*
				7254	* If the guest has used debug registers, at least dr7
				7255	* will be disabled while returning to the host.
				7256	* If we don't have active breakpoints in the host, we don't
				7257	* care about the messed up debug address registers. But if
				7258	* we have some of them active, restore the old state.
				7259	*/
				7260	if (hw_breakpoint_active())
				7261	hw_breakpoint_restore();
				7262
				7263	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
				7264
				7265	vcpu->mode = OUTSIDE_GUEST_MODE;
				7266	smp_wmb();
				7267
				7268	kvm_put_guest_xcr0(vcpu);
				7269
				7270	kvm_x86_ops->handle_external_intr(vcpu);
				7271
				7272	++vcpu->stat.exits;
				7273
				7274	guest_exit_irqoff();
				7275
				7276	local_irq_enable();
				7277	preempt_enable();
				7278
				7279	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				7280
				7281	/*
				7282	* Profile KVM exit RIPs:
				7283	*/
				7284	if (unlikely(prof_on == KVM_PROFILING)) {
				7285	unsigned long rip = kvm_rip_read(vcpu);
				7286	profile_hit(KVM_PROFILING, (void *)rip);
				7287	}
				7288
				7289	if (unlikely(vcpu->arch.tsc_always_catchup))
				7290	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				7291
				7292	if (vcpu->arch.apic_attention)
				7293	kvm_lapic_sync_from_vapic(vcpu);
				7294
				7295	vcpu->arch.gpa_available = false;
				7296	r = kvm_x86_ops->handle_exit(vcpu);
				7297	return r;
				7298
				7299	cancel_injection:
				7300	kvm_x86_ops->cancel_injection(vcpu);
				7301	if (unlikely(vcpu->arch.apic_attention))
				7302	kvm_lapic_sync_from_vapic(vcpu);
				7303	out:
				7304	return r;
				7305	}
				7306
				7307	static inline int vcpu_block(struct kvm kvm, struct kvm_vcpu vcpu)
				7308	{
				7309	if (!kvm_arch_vcpu_runnable(vcpu) &&
				7310	(!kvm_x86_ops->pre_block \|\| kvm_x86_ops->pre_block(vcpu) == 0)) {
				7311	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				7312	kvm_vcpu_block(vcpu);
				7313	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				7314
				7315	if (kvm_x86_ops->post_block)
				7316	kvm_x86_ops->post_block(vcpu);
				7317
				7318	if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
				7319	return 1;
				7320	}
				7321
				7322	kvm_apic_accept_events(vcpu);
				7323	switch(vcpu->arch.mp_state) {
				7324	case KVM_MP_STATE_HALTED:
				7325	vcpu->arch.pv.pv_unhalted = false;
				7326	vcpu->arch.mp_state =
				7327	KVM_MP_STATE_RUNNABLE;
				7328	case KVM_MP_STATE_RUNNABLE:
				7329	vcpu->arch.apf.halted = false;
				7330	break;
				7331	case KVM_MP_STATE_INIT_RECEIVED:
				7332	break;
				7333	default:
				7334	return -EINTR;
				7335	break;
				7336	}
				7337	return 1;
				7338	}
				7339
				7340	static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
				7341	{
				7342	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
				7343	kvm_x86_ops->check_nested_events(vcpu);
				7344
				7345	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
				7346	!vcpu->arch.apf.halted);
				7347	}
				7348
				7349	static int vcpu_run(struct kvm_vcpu *vcpu)
				7350	{
				7351	int r;
				7352	struct kvm *kvm = vcpu->kvm;
				7353
				7354	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				7355	vcpu->arch.l1tf_flush_l1d = true;
				7356
				7357	for (;;) {
				7358	if (kvm_vcpu_running(vcpu)) {
				7359	r = vcpu_enter_guest(vcpu);
				7360	} else {
				7361	r = vcpu_block(kvm, vcpu);
				7362	}
				7363
				7364	if (r <= 0)
				7365	break;
				7366
				7367	kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
				7368	if (kvm_cpu_has_pending_timer(vcpu))
				7369	kvm_inject_pending_timer_irqs(vcpu);
				7370
				7371	if (dm_request_for_irq_injection(vcpu) &&
				7372	kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
				7373	r = 0;
				7374	vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
				7375	++vcpu->stat.request_irq_exits;
				7376	break;
				7377	}
				7378
				7379	kvm_check_async_pf_completion(vcpu);
				7380
				7381	if (signal_pending(current)) {
				7382	r = -EINTR;
				7383	vcpu->run->exit_reason = KVM_EXIT_INTR;
				7384	++vcpu->stat.signal_exits;
				7385	break;
				7386	}
				7387	if (need_resched()) {
				7388	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				7389	cond_resched();
				7390	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				7391	}
				7392	}
				7393
				7394	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				7395
				7396	return r;
				7397	}
				7398
				7399	static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
				7400	{
				7401	int r;
				7402	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				7403	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
				7404	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
				7405	if (r != EMULATE_DONE)
				7406	return 0;
				7407	return 1;
				7408	}
				7409
				7410	static int complete_emulated_pio(struct kvm_vcpu *vcpu)
				7411	{
				7412	BUG_ON(!vcpu->arch.pio.count);
				7413
				7414	return complete_emulated_io(vcpu);
				7415	}
				7416
				7417	/*
				7418	* Implements the following, as a state machine:
				7419	*
				7420	* read:
				7421	* for each fragment
				7422	* for each mmio piece in the fragment
				7423	* write gpa, len
				7424	* exit
				7425	* copy data
				7426	* execute insn
				7427	*
				7428	* write:
				7429	* for each fragment
				7430	* for each mmio piece in the fragment
				7431	* write gpa, len
				7432	* copy data
				7433	* exit
				7434	*/
				7435	static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
				7436	{
				7437	struct kvm_run *run = vcpu->run;
				7438	struct kvm_mmio_fragment *frag;
				7439	unsigned len;
				7440
				7441	BUG_ON(!vcpu->mmio_needed);
				7442
				7443	/* Complete previous fragment */
				7444	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
				7445	len = min(8u, frag->len);
				7446	if (!vcpu->mmio_is_write)
				7447	memcpy(frag->data, run->mmio.data, len);
				7448
				7449	if (frag->len <= 8) {
				7450	/* Switch to the next fragment. */
				7451	frag++;
				7452	vcpu->mmio_cur_fragment++;
				7453	} else {
				7454	/* Go forward to the next mmio piece. */
				7455	frag->data += len;
				7456	frag->gpa += len;
				7457	frag->len -= len;
				7458	}
				7459
				7460	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
				7461	vcpu->mmio_needed = 0;
				7462
				7463	/* FIXME: return into emulator if single-stepping. */
				7464	if (vcpu->mmio_is_write)
				7465	return 1;
				7466	vcpu->mmio_read_completed = 1;
				7467	return complete_emulated_io(vcpu);
				7468	}
				7469
				7470	run->exit_reason = KVM_EXIT_MMIO;
				7471	run->mmio.phys_addr = frag->gpa;
				7472	if (vcpu->mmio_is_write)
				7473	memcpy(run->mmio.data, frag->data, min(8u, frag->len));
				7474	run->mmio.len = min(8u, frag->len);
				7475	run->mmio.is_write = vcpu->mmio_is_write;
				7476	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
				7477	return 0;
				7478	}
				7479
				7480
				7481	int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu vcpu, struct kvm_run kvm_run)
				7482	{
				7483	int r;
				7484
				7485	kvm_sigset_activate(vcpu);
				7486
				7487	kvm_load_guest_fpu(vcpu);
				7488
				7489	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
				7490	if (kvm_run->immediate_exit) {
				7491	r = -EINTR;
				7492	goto out;
				7493	}
				7494	kvm_vcpu_block(vcpu);
				7495	kvm_apic_accept_events(vcpu);
				7496	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
				7497	r = -EAGAIN;
				7498	if (signal_pending(current)) {
				7499	r = -EINTR;
				7500	vcpu->run->exit_reason = KVM_EXIT_INTR;
				7501	++vcpu->stat.signal_exits;
				7502	}
				7503	goto out;
				7504	}
				7505
				7506	/* re-sync apic's tpr */
				7507	if (!lapic_in_kernel(vcpu)) {
				7508	if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
				7509	r = -EINVAL;
				7510	goto out;
				7511	}
				7512	}
				7513
				7514	if (unlikely(vcpu->arch.complete_userspace_io)) {
				7515	int (cui)(struct kvm_vcpu ) = vcpu->arch.complete_userspace_io;
				7516	vcpu->arch.complete_userspace_io = NULL;
				7517	r = cui(vcpu);
				7518	if (r <= 0)
				7519	goto out;
				7520	} else
				7521	WARN_ON(vcpu->arch.pio.count \|\| vcpu->mmio_needed);
				7522
				7523	if (kvm_run->immediate_exit)
				7524	r = -EINTR;
				7525	else
				7526	r = vcpu_run(vcpu);
				7527
				7528	out:
				7529	kvm_put_guest_fpu(vcpu);
				7530	post_kvm_run_save(vcpu);
				7531	kvm_sigset_deactivate(vcpu);
				7532
				7533	return r;
				7534	}
				7535
				7536	int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				7537	{
				7538	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
				7539	/*
				7540	* We are here if userspace calls get_regs() in the middle of
				7541	* instruction emulation. Registers state needs to be copied
				7542	* back from emulation context to vcpu. Userspace shouldn't do
				7543	* that usually, but some bad designed PV devices (vmware
				7544	* backdoor interface) need this to work
				7545	*/
				7546	emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
				7547	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
				7548	}
				7549	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
				7550	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
				7551	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
				7552	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
				7553	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
				7554	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
				7555	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
				7556	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
				7557	#ifdef CONFIG_X86_64
				7558	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
				7559	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
				7560	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
				7561	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
				7562	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
				7563	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
				7564	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
				7565	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
				7566	#endif
				7567
				7568	regs->rip = kvm_rip_read(vcpu);
				7569	regs->rflags = kvm_get_rflags(vcpu);
				7570
				7571	return 0;
				7572	}
				7573
				7574	int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				7575	{
				7576	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
				7577	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
				7578
				7579	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
				7580	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
				7581	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
				7582	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
				7583	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
				7584	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
				7585	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
				7586	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
				7587	#ifdef CONFIG_X86_64
				7588	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
				7589	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
				7590	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
				7591	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
				7592	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
				7593	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
				7594	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
				7595	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
				7596	#endif
				7597
				7598	kvm_rip_write(vcpu, regs->rip);
				7599	kvm_set_rflags(vcpu, regs->rflags \| X86_EFLAGS_FIXED);
				7600
				7601	vcpu->arch.exception.pending = false;
				7602
				7603	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7604
				7605	return 0;
				7606	}
				7607
				7608	void kvm_get_cs_db_l_bits(struct kvm_vcpu vcpu, int db, int *l)
				7609	{
				7610	struct kvm_segment cs;
				7611
				7612	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
				7613	*db = cs.db;
				7614	*l = cs.l;
				7615	}
				7616	EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
				7617
				7618	int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				7619	struct kvm_sregs *sregs)
				7620	{
				7621	struct desc_ptr dt;
				7622
				7623	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
				7624	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
				7625	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
				7626	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
				7627	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
				7628	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
				7629
				7630	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
				7631	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
				7632
				7633	kvm_x86_ops->get_idt(vcpu, &dt);
				7634	sregs->idt.limit = dt.size;
				7635	sregs->idt.base = dt.address;
				7636	kvm_x86_ops->get_gdt(vcpu, &dt);
				7637	sregs->gdt.limit = dt.size;
				7638	sregs->gdt.base = dt.address;
				7639
				7640	sregs->cr0 = kvm_read_cr0(vcpu);
				7641	sregs->cr2 = vcpu->arch.cr2;
				7642	sregs->cr3 = kvm_read_cr3(vcpu);
				7643	sregs->cr4 = kvm_read_cr4(vcpu);
				7644	sregs->cr8 = kvm_get_cr8(vcpu);
				7645	sregs->efer = vcpu->arch.efer;
				7646	sregs->apic_base = kvm_get_apic_base(vcpu);
				7647
				7648	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
				7649
				7650	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
				7651	set_bit(vcpu->arch.interrupt.nr,
				7652	(unsigned long *)sregs->interrupt_bitmap);
				7653
				7654	return 0;
				7655	}
				7656
				7657	int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
				7658	struct kvm_mp_state *mp_state)
				7659	{
				7660	if (kvm_mpx_supported())
				7661	kvm_load_guest_fpu(vcpu);
				7662
				7663	kvm_apic_accept_events(vcpu);
				7664	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
				7665	vcpu->arch.pv.pv_unhalted)
				7666	mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
				7667	else
				7668	mp_state->mp_state = vcpu->arch.mp_state;
				7669
				7670	if (kvm_mpx_supported())
				7671	kvm_put_guest_fpu(vcpu);
				7672	return 0;
				7673	}
				7674
				7675	int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
				7676	struct kvm_mp_state *mp_state)
				7677	{
				7678	if (!lapic_in_kernel(vcpu) &&
				7679	mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
				7680	return -EINVAL;
				7681
				7682	/* INITs are latched while in SMM */
				7683	if ((is_smm(vcpu) \|\| vcpu->arch.smi_pending) &&
				7684	(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED \|\|
				7685	mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
				7686	return -EINVAL;
				7687
				7688	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
				7689	vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
				7690	set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
				7691	} else
				7692	vcpu->arch.mp_state = mp_state->mp_state;
				7693	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7694	return 0;
				7695	}
				7696
				7697	int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
				7698	int reason, bool has_error_code, u32 error_code)
				7699	{
				7700	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				7701	int ret;
				7702
				7703	init_emulate_ctxt(vcpu);
				7704
				7705	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
				7706	has_error_code, error_code);
				7707
				7708	if (ret)
				7709	return EMULATE_FAIL;
				7710
				7711	kvm_rip_write(vcpu, ctxt->eip);
				7712	kvm_set_rflags(vcpu, ctxt->eflags);
				7713	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7714	return EMULATE_DONE;
				7715	}
				7716	EXPORT_SYMBOL_GPL(kvm_task_switch);
				7717
				7718	int kvm_valid_sregs(struct kvm_vcpu vcpu, struct kvm_sregs sregs)
				7719	{
				7720	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
				7721	/*
				7722	* When EFER.LME and CR0.PG are set, the processor is in
				7723	* 64-bit mode (though maybe in a 32-bit code segment).
				7724	* CR4.PAE and EFER.LMA must be set.
				7725	*/
				7726	if (!(sregs->cr4 & X86_CR4_PAE)
				7727	\|\| !(sregs->efer & EFER_LMA))
				7728	return -EINVAL;
				7729	} else {
				7730	/*
				7731	* Not in 64-bit mode: EFER.LMA is clear and the code
				7732	* segment cannot be 64-bit.
				7733	*/
				7734	if (sregs->efer & EFER_LMA \|\| sregs->cs.l)
				7735	return -EINVAL;
				7736	}
				7737
				7738	return 0;
				7739	}
				7740
				7741	int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				7742	struct kvm_sregs *sregs)
				7743	{
				7744	struct msr_data apic_base_msr;
				7745	int mmu_reset_needed = 0;
				7746	int cpuid_update_needed = 0;
				7747	int pending_vec, max_bits, idx;
				7748	struct desc_ptr dt;
				7749
				7750	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
				7751	(sregs->cr4 & X86_CR4_OSXSAVE))
				7752	return -EINVAL;
				7753
				7754	if (kvm_valid_sregs(vcpu, sregs))
				7755	return -EINVAL;
				7756
				7757	apic_base_msr.data = sregs->apic_base;
				7758	apic_base_msr.host_initiated = true;
				7759	if (kvm_set_apic_base(vcpu, &apic_base_msr))
				7760	return -EINVAL;
				7761
				7762	dt.size = sregs->idt.limit;
				7763	dt.address = sregs->idt.base;
				7764	kvm_x86_ops->set_idt(vcpu, &dt);
				7765	dt.size = sregs->gdt.limit;
				7766	dt.address = sregs->gdt.base;
				7767	kvm_x86_ops->set_gdt(vcpu, &dt);
				7768
				7769	vcpu->arch.cr2 = sregs->cr2;
				7770	mmu_reset_needed \|= kvm_read_cr3(vcpu) != sregs->cr3;
				7771	vcpu->arch.cr3 = sregs->cr3;
				7772	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
				7773
				7774	kvm_set_cr8(vcpu, sregs->cr8);
				7775
				7776	mmu_reset_needed \|= vcpu->arch.efer != sregs->efer;
				7777	kvm_x86_ops->set_efer(vcpu, sregs->efer);
				7778
				7779	mmu_reset_needed \|= kvm_read_cr0(vcpu) != sregs->cr0;
				7780	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
				7781	vcpu->arch.cr0 = sregs->cr0;
				7782
				7783	mmu_reset_needed \|= kvm_read_cr4(vcpu) != sregs->cr4;
				7784	cpuid_update_needed \|= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
				7785	(X86_CR4_OSXSAVE \| X86_CR4_PKE));
				7786	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
				7787	if (cpuid_update_needed)
				7788	kvm_update_cpuid(vcpu);
				7789
				7790	idx = srcu_read_lock(&vcpu->kvm->srcu);
				7791	if (is_pae_paging(vcpu)) {
				7792	load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
				7793	mmu_reset_needed = 1;
				7794	}
				7795	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				7796
				7797	if (mmu_reset_needed)
				7798	kvm_mmu_reset_context(vcpu);
				7799
				7800	max_bits = KVM_NR_INTERRUPTS;
				7801	pending_vec = find_first_bit(
				7802	(const unsigned long *)sregs->interrupt_bitmap, max_bits);
				7803	if (pending_vec < max_bits) {
				7804	kvm_queue_interrupt(vcpu, pending_vec, false);
				7805	pr_debug("Set back pending irq %d\n", pending_vec);
				7806	}
				7807
				7808	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
				7809	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
				7810	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
				7811	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
				7812	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
				7813	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
				7814
				7815	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
				7816	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
				7817
				7818	update_cr8_intercept(vcpu);
				7819
				7820	/* Older userspace won't unhalt the vcpu on reset. */
				7821	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
				7822	sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
				7823	!is_protmode(vcpu))
				7824	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				7825
				7826	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7827
				7828	return 0;
				7829	}
				7830
				7831	int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
				7832	struct kvm_guest_debug *dbg)
				7833	{
				7834	unsigned long rflags;
				7835	int i, r;
				7836
				7837	if (dbg->control & (KVM_GUESTDBG_INJECT_DB \| KVM_GUESTDBG_INJECT_BP)) {
				7838	r = -EBUSY;
				7839	if (vcpu->arch.exception.pending)
				7840	goto out;
				7841	if (dbg->control & KVM_GUESTDBG_INJECT_DB)
				7842	kvm_queue_exception(vcpu, DB_VECTOR);
				7843	else
				7844	kvm_queue_exception(vcpu, BP_VECTOR);
				7845	}
				7846
				7847	/*
				7848	* Read rflags as long as potentially injected trace flags are still
				7849	* filtered out.
				7850	*/
				7851	rflags = kvm_get_rflags(vcpu);
				7852
				7853	vcpu->guest_debug = dbg->control;
				7854	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
				7855	vcpu->guest_debug = 0;
				7856
				7857	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
				7858	for (i = 0; i < KVM_NR_DB_REGS; ++i)
				7859	vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
				7860	vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
				7861	} else {
				7862	for (i = 0; i < KVM_NR_DB_REGS; i++)
				7863	vcpu->arch.eff_db[i] = vcpu->arch.db[i];
				7864	}
				7865	kvm_update_dr7(vcpu);
				7866
				7867	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
				7868	vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
				7869	get_segment_base(vcpu, VCPU_SREG_CS);
				7870
				7871	/*
				7872	* Trigger an rflags update that will inject or remove the trace
				7873	* flags.
				7874	*/
				7875	kvm_set_rflags(vcpu, rflags);
				7876
				7877	kvm_x86_ops->update_bp_intercept(vcpu);
				7878
				7879	r = 0;
				7880
				7881	out:
				7882
				7883	return r;
				7884	}
				7885
				7886	/*
				7887	* Translate a guest virtual address to a guest physical address.
				7888	*/
				7889	int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				7890	struct kvm_translation *tr)
				7891	{
				7892	unsigned long vaddr = tr->linear_address;
				7893	gpa_t gpa;
				7894	int idx;
				7895
				7896	idx = srcu_read_lock(&vcpu->kvm->srcu);
				7897	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
				7898	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				7899	tr->physical_address = gpa;
				7900	tr->valid = gpa != UNMAPPED_GVA;
				7901	tr->writeable = 1;
				7902	tr->usermode = 0;
				7903
				7904	return 0;
				7905	}
				7906
				7907	int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu vcpu, struct kvm_fpu fpu)
				7908	{
				7909	struct fxregs_state *fxsave =
				7910	&vcpu->arch.guest_fpu.state.fxsave;
				7911
				7912	memcpy(fpu->fpr, fxsave->st_space, 128);
				7913	fpu->fcw = fxsave->cwd;
				7914	fpu->fsw = fxsave->swd;
				7915	fpu->ftwx = fxsave->twd;
				7916	fpu->last_opcode = fxsave->fop;
				7917	fpu->last_ip = fxsave->rip;
				7918	fpu->last_dp = fxsave->rdp;
				7919	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
				7920
				7921	return 0;
				7922	}
				7923
				7924	int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu vcpu, struct kvm_fpu fpu)
				7925	{
				7926	struct fxregs_state *fxsave =
				7927	&vcpu->arch.guest_fpu.state.fxsave;
				7928
				7929	memcpy(fxsave->st_space, fpu->fpr, 128);
				7930	fxsave->cwd = fpu->fcw;
				7931	fxsave->swd = fpu->fsw;
				7932	fxsave->twd = fpu->ftwx;
				7933	fxsave->fop = fpu->last_opcode;
				7934	fxsave->rip = fpu->last_ip;
				7935	fxsave->rdp = fpu->last_dp;
				7936	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
				7937
				7938	return 0;
				7939	}
				7940
				7941	static void fx_init(struct kvm_vcpu *vcpu)
				7942	{
				7943	fpstate_init(&vcpu->arch.guest_fpu.state);
				7944	if (boot_cpu_has(X86_FEATURE_XSAVES))
				7945	vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
				7946	host_xcr0 \| XSTATE_COMPACTION_ENABLED;
				7947
				7948	/*
				7949	* Ensure guest xcr0 is valid for loading
				7950	*/
				7951	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
				7952
				7953	vcpu->arch.cr0 \|= X86_CR0_ET;
				7954	}
				7955
				7956	/* Swap (qemu) user FPU context for the guest FPU context. */
				7957	void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
				7958	{
				7959	preempt_disable();
				7960	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
				7961	/* PKRU is separately restored in kvm_x86_ops->run. */
				7962	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
				7963	~XFEATURE_MASK_PKRU);
				7964	preempt_enable();
				7965	trace_kvm_fpu(1);
				7966	}
				7967
				7968	/* When vcpu_run ends, restore user space FPU context. */
				7969	void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
				7970	{
				7971	preempt_disable();
				7972	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
				7973	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
				7974	preempt_enable();
				7975	++vcpu->stat.fpu_reload;
				7976	trace_kvm_fpu(0);
				7977	}
				7978
				7979	void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
				7980	{
				7981	void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
				7982
				7983	kvmclock_reset(vcpu);
				7984
				7985	kvm_x86_ops->vcpu_free(vcpu);
				7986	free_cpumask_var(wbinvd_dirty_mask);
				7987	}
				7988
				7989	struct kvm_vcpu kvm_arch_vcpu_create(struct kvm kvm,
				7990	unsigned int id)
				7991	{
				7992	struct kvm_vcpu *vcpu;
				7993
				7994	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
				7995	printk_once(KERN_WARNING
				7996	"kvm: SMP vm created on host with unstable TSC; "
				7997	"guest TSC will not be reliable\n");
				7998
				7999	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
				8000
				8001	return vcpu;
				8002	}
				8003
				8004	int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
				8005	{
				8006	int r;
				8007
				8008	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
				8009	kvm_vcpu_mtrr_init(vcpu);
				8010	r = vcpu_load(vcpu);
				8011	if (r)
				8012	return r;
				8013	kvm_vcpu_reset(vcpu, false);
				8014	kvm_mmu_setup(vcpu);
				8015	vcpu_put(vcpu);
				8016	return r;
				8017	}
				8018
				8019	void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
				8020	{
				8021	struct msr_data msr;
				8022	struct kvm *kvm = vcpu->kvm;
				8023
				8024	kvm_hv_vcpu_postcreate(vcpu);
				8025
				8026	if (vcpu_load(vcpu))
				8027	return;
				8028	msr.data = 0x0;
				8029	msr.index = MSR_IA32_TSC;
				8030	msr.host_initiated = true;
				8031	kvm_write_tsc(vcpu, &msr);
				8032	vcpu_put(vcpu);
				8033
				8034	if (!kvmclock_periodic_sync)
				8035	return;
				8036
				8037	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
				8038	KVMCLOCK_SYNC_PERIOD);
				8039	}
				8040
				8041	void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
				8042	{
				8043	int r;
				8044	vcpu->arch.apf.msr_val = 0;
				8045
				8046	r = vcpu_load(vcpu);
				8047	BUG_ON(r);
				8048	kvm_mmu_unload(vcpu);
				8049	vcpu_put(vcpu);
				8050
				8051	kvm_arch_vcpu_free(vcpu);
				8052	}
				8053
				8054	void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
				8055	{
				8056	kvm_lapic_reset(vcpu, init_event);
				8057
				8058	vcpu->arch.hflags = 0;
				8059
				8060	vcpu->arch.smi_pending = 0;
				8061	atomic_set(&vcpu->arch.nmi_queued, 0);
				8062	vcpu->arch.nmi_pending = 0;
				8063	vcpu->arch.nmi_injected = false;
				8064	kvm_clear_interrupt_queue(vcpu);
				8065	kvm_clear_exception_queue(vcpu);
				8066	vcpu->arch.exception.pending = false;
				8067
				8068	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
				8069	kvm_update_dr0123(vcpu);
				8070	vcpu->arch.dr6 = DR6_INIT;
				8071	kvm_update_dr6(vcpu);
				8072	vcpu->arch.dr7 = DR7_FIXED_1;
				8073	kvm_update_dr7(vcpu);
				8074
				8075	vcpu->arch.cr2 = 0;
				8076
				8077	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8078	vcpu->arch.apf.msr_val = 0;
				8079	vcpu->arch.st.msr_val = 0;
				8080
				8081	kvmclock_reset(vcpu);
				8082
				8083	kvm_clear_async_pf_completion_queue(vcpu);
				8084	kvm_async_pf_hash_reset(vcpu);
				8085	vcpu->arch.apf.halted = false;
				8086
				8087	if (!init_event) {
				8088	kvm_pmu_reset(vcpu);
				8089	vcpu->arch.smbase = 0x30000;
				8090
				8091	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
				8092	vcpu->arch.msr_misc_features_enables = 0;
				8093	}
				8094
				8095	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
				8096	vcpu->arch.regs_avail = ~0;
				8097	vcpu->arch.regs_dirty = ~0;
				8098
				8099	kvm_x86_ops->vcpu_reset(vcpu, init_event);
				8100	}
				8101
				8102	void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
				8103	{
				8104	struct kvm_segment cs;
				8105
				8106	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
				8107	cs.selector = vector << 8;
				8108	cs.base = vector << 12;
				8109	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
				8110	kvm_rip_write(vcpu, 0);
				8111	}
				8112
				8113	int kvm_arch_hardware_enable(void)
				8114	{
				8115	struct kvm *kvm;
				8116	struct kvm_vcpu *vcpu;
				8117	int i;
				8118	int ret;
				8119	u64 local_tsc;
				8120	u64 max_tsc = 0;
				8121	bool stable, backwards_tsc = false;
				8122
				8123	kvm_shared_msr_cpu_online();
				8124	ret = kvm_x86_ops->hardware_enable();
				8125	if (ret != 0)
				8126	return ret;
				8127
				8128	local_tsc = rdtsc();
				8129	stable = !check_tsc_unstable();
				8130	list_for_each_entry(kvm, &vm_list, vm_list) {
				8131	kvm_for_each_vcpu(i, vcpu, kvm) {
				8132	if (!stable && vcpu->cpu == smp_processor_id())
				8133	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				8134	if (stable && vcpu->arch.last_host_tsc > local_tsc) {
				8135	backwards_tsc = true;
				8136	if (vcpu->arch.last_host_tsc > max_tsc)
				8137	max_tsc = vcpu->arch.last_host_tsc;
				8138	}
				8139	}
				8140	}
				8141
				8142	/*
				8143	* Sometimes, even reliable TSCs go backwards. This happens on
				8144	* platforms that reset TSC during suspend or hibernate actions, but
				8145	* maintain synchronization. We must compensate. Fortunately, we can
				8146	* detect that condition here, which happens early in CPU bringup,
				8147	* before any KVM threads can be running. Unfortunately, we can't
				8148	* bring the TSCs fully up to date with real time, as we aren't yet far
				8149	* enough into CPU bringup that we know how much real time has actually
				8150	* elapsed; our helper function, ktime_get_boot_ns() will be using boot
				8151	* variables that haven't been updated yet.
				8152	*
				8153	* So we simply find the maximum observed TSC above, then record the
				8154	* adjustment to TSC in each VCPU. When the VCPU later gets loaded,
				8155	* the adjustment will be applied. Note that we accumulate
				8156	* adjustments, in case multiple suspend cycles happen before some VCPU
				8157	* gets a chance to run again. In the event that no KVM threads get a
				8158	* chance to run, we will miss the entire elapsed period, as we'll have
				8159	* reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
				8160	* loose cycle time. This isn't too big a deal, since the loss will be
				8161	* uniform across all VCPUs (not to mention the scenario is extremely
				8162	* unlikely). It is possible that a second hibernate recovery happens
				8163	* much faster than a first, causing the observed TSC here to be
				8164	* smaller; this would require additional padding adjustment, which is
				8165	* why we set last_host_tsc to the local tsc observed here.
				8166	*
				8167	* N.B. - this code below runs only on platforms with reliable TSC,
				8168	* as that is the only way backwards_tsc is set above. Also note
				8169	* that this runs for ALL vcpus, which is not a bug; all VCPUs should
				8170	* have the same delta_cyc adjustment applied if backwards_tsc
				8171	* is detected. Note further, this adjustment is only done once,
				8172	* as we reset last_host_tsc on all VCPUs to stop this from being
				8173	* called multiple times (one for each physical CPU bringup).
				8174	*
				8175	* Platforms with unreliable TSCs don't have to deal with this, they
				8176	* will be compensated by the logic in vcpu_load, which sets the TSC to
				8177	* catchup mode. This will catchup all VCPUs to real time, but cannot
				8178	* guarantee that they stay in perfect synchronization.
				8179	*/
				8180	if (backwards_tsc) {
				8181	u64 delta_cyc = max_tsc - local_tsc;
				8182	list_for_each_entry(kvm, &vm_list, vm_list) {
				8183	kvm->arch.backwards_tsc_observed = true;
				8184	kvm_for_each_vcpu(i, vcpu, kvm) {
				8185	vcpu->arch.tsc_offset_adjustment += delta_cyc;
				8186	vcpu->arch.last_host_tsc = local_tsc;
				8187	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				8188	}
				8189
				8190	/*
				8191	* We have to disable TSC offset matching.. if you were
				8192	* booting a VM while issuing an S4 host suspend....
				8193	* you may have some problem. Solving this issue is
				8194	* left as an exercise to the reader.
				8195	*/
				8196	kvm->arch.last_tsc_nsec = 0;
				8197	kvm->arch.last_tsc_write = 0;
				8198	}
				8199
				8200	}
				8201	return 0;
				8202	}
				8203
				8204	void kvm_arch_hardware_disable(void)
				8205	{
				8206	kvm_x86_ops->hardware_disable();
				8207	drop_user_return_notifiers();
				8208	}
				8209
				8210	int kvm_arch_hardware_setup(void)
				8211	{
				8212	int r;
				8213
				8214	r = kvm_x86_ops->hardware_setup();
				8215	if (r != 0)
				8216	return r;
				8217
				8218	if (kvm_has_tsc_control) {
				8219	/*
				8220	* Make sure the user can only configure tsc_khz values that
				8221	* fit into a signed integer.
				8222	* A min value is not calculated needed because it will always
				8223	* be 1 on all machines.
				8224	*/
				8225	u64 max = min(0x7fffffffULL,
				8226	__scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
				8227	kvm_max_guest_tsc_khz = max;
				8228
				8229	kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
				8230	}
				8231
				8232	kvm_init_msr_list();
				8233	return 0;
				8234	}
				8235
				8236	void kvm_arch_hardware_unsetup(void)
				8237	{
				8238	kvm_x86_ops->hardware_unsetup();
				8239	}
				8240
				8241	void kvm_arch_check_processor_compat(void *rtn)
				8242	{
				8243	kvm_x86_ops->check_processor_compatibility(rtn);
				8244	}
				8245
				8246	bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
				8247	{
				8248	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
				8249	}
				8250	EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
				8251
				8252	bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
				8253	{
				8254	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
				8255	}
				8256
/*
 * Static key that is incremented in kvm_arch_vcpu_init() for every vCPU
 * created without an in-kernel irqchip and decremented again in
 * kvm_arch_vcpu_uninit().
 */
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
				8259
				8260	int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
				8261	{
				8262	struct page *page;
				8263	struct kvm *kvm;
				8264	int r;
				8265
				8266	BUG_ON(vcpu->kvm == NULL);
				8267	kvm = vcpu->kvm;
				8268
				8269	vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
				8270	vcpu->arch.pv.pv_unhalted = false;
				8271	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
				8272	if (!irqchip_in_kernel(kvm) \|\| kvm_vcpu_is_reset_bsp(vcpu))
				8273	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				8274	else
				8275	vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
				8276
				8277	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				8278	if (!page) {
				8279	r = -ENOMEM;
				8280	goto fail;
				8281	}
				8282	vcpu->arch.pio_data = page_address(page);
				8283
				8284	kvm_set_tsc_khz(vcpu, max_tsc_khz);
				8285
				8286	r = kvm_mmu_create(vcpu);
				8287	if (r < 0)
				8288	goto fail_free_pio_data;
				8289
				8290	if (irqchip_in_kernel(kvm)) {
				8291	r = kvm_create_lapic(vcpu);
				8292	if (r < 0)
				8293	goto fail_mmu_destroy;
				8294	} else
				8295	static_key_slow_inc(&kvm_no_apic_vcpu);
				8296
				8297	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
				8298	GFP_KERNEL);
				8299	if (!vcpu->arch.mce_banks) {
				8300	r = -ENOMEM;
				8301	goto fail_free_lapic;
				8302	}
				8303	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
				8304
				8305	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
				8306	r = -ENOMEM;
				8307	goto fail_free_mce_banks;
				8308	}
				8309
				8310	fx_init(vcpu);
				8311
				8312	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
				8313	vcpu->arch.pv_time_enabled = false;
				8314
				8315	vcpu->arch.guest_supported_xcr0 = 0;
				8316	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
				8317
				8318	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
				8319
				8320	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
				8321
				8322	kvm_async_pf_hash_reset(vcpu);
				8323	kvm_pmu_init(vcpu);
				8324
				8325	vcpu->arch.pending_external_vector = -1;
				8326	vcpu->arch.preempted_in_kernel = false;
				8327
				8328	kvm_hv_vcpu_init(vcpu);
				8329
				8330	return 0;
				8331
				8332	fail_free_mce_banks:
				8333	kfree(vcpu->arch.mce_banks);
				8334	fail_free_lapic:
				8335	kvm_free_lapic(vcpu);
				8336	fail_mmu_destroy:
				8337	kvm_mmu_destroy(vcpu);
				8338	fail_free_pio_data:
				8339	free_page((unsigned long)vcpu->arch.pio_data);
				8340	fail:
				8341	return r;
				8342	}
				8343
				8344	void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
				8345	{
				8346	int idx;
				8347
				8348	kvm_hv_vcpu_uninit(vcpu);
				8349	kvm_pmu_destroy(vcpu);
				8350	kfree(vcpu->arch.mce_banks);
				8351	kvm_free_lapic(vcpu);
				8352	idx = srcu_read_lock(&vcpu->kvm->srcu);
				8353	kvm_mmu_destroy(vcpu);
				8354	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				8355	free_page((unsigned long)vcpu->arch.pio_data);
				8356	if (!lapic_in_kernel(vcpu))
				8357	static_key_slow_dec(&kvm_no_apic_vcpu);
				8358	}
				8359
				8360	void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
				8361	{
				8362	vcpu->arch.l1tf_flush_l1d = true;
				8363	kvm_x86_ops->sched_in(vcpu, cpu);
				8364	}
				8365
				8366	int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
				8367	{
				8368	if (type)
				8369	return -EINVAL;
				8370
				8371	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
				8372	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
				8373	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
				8374	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
				8375	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
				8376	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
				8377
				8378	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
				8379	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
				8380	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
				8381	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
				8382	&kvm->arch.irq_sources_bitmap);
				8383
				8384	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
				8385	mutex_init(&kvm->arch.apic_map_lock);
				8386	mutex_init(&kvm->arch.hyperv.hv_lock);
				8387	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
				8388
				8389	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
				8390	pvclock_update_vm_gtod_copy(kvm);
				8391
				8392	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
				8393	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
				8394
				8395	kvm_page_track_init(kvm);
				8396	kvm_mmu_init_vm(kvm);
				8397
				8398	if (kvm_x86_ops->vm_init)
				8399	return kvm_x86_ops->vm_init(kvm);
				8400
				8401	return 0;
				8402	}
				8403
				8404	int kvm_arch_post_init_vm(struct kvm *kvm)
				8405	{
				8406	return kvm_mmu_post_init_vm(kvm);
				8407	}
				8408
				8409	static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
				8410	{
				8411	int r;
				8412	r = vcpu_load(vcpu);
				8413	BUG_ON(r);
				8414	kvm_mmu_unload(vcpu);
				8415	vcpu_put(vcpu);
				8416	}
				8417
				8418	static void kvm_free_vcpus(struct kvm *kvm)
				8419	{
				8420	unsigned int i;
				8421	struct kvm_vcpu *vcpu;
				8422
				8423	/*
				8424	* Unpin any mmu pages first.
				8425	*/
				8426	kvm_for_each_vcpu(i, vcpu, kvm) {
				8427	kvm_clear_async_pf_completion_queue(vcpu);
				8428	kvm_unload_vcpu_mmu(vcpu);
				8429	}
				8430	kvm_for_each_vcpu(i, vcpu, kvm)
				8431	kvm_arch_vcpu_free(vcpu);
				8432
				8433	mutex_lock(&kvm->lock);
				8434	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
				8435	kvm->vcpus[i] = NULL;
				8436
				8437	atomic_set(&kvm->online_vcpus, 0);
				8438	mutex_unlock(&kvm->lock);
				8439	}
				8440
				8441	void kvm_arch_sync_events(struct kvm *kvm)
				8442	{
				8443	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
				8444	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
				8445	kvm_free_pit(kvm);
				8446	}
				8447
				8448	int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
				8449	{
				8450	int i, r;
				8451	unsigned long hva;
				8452	struct kvm_memslots *slots = kvm_memslots(kvm);
				8453	struct kvm_memory_slot *slot, old;
				8454
				8455	/* Called with kvm->slots_lock held. */
				8456	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
				8457	return -EINVAL;
				8458
				8459	slot = id_to_memslot(slots, id);
				8460	if (size) {
				8461	if (slot->npages)
				8462	return -EEXIST;
				8463
				8464	/*
				8465	* MAP_SHARED to prevent internal slot pages from being moved
				8466	* by fork()/COW.
				8467	*/
				8468	hva = vm_mmap(NULL, 0, size, PROT_READ \| PROT_WRITE,
				8469	MAP_SHARED \| MAP_ANONYMOUS, 0);
				8470	if (IS_ERR((void *)hva))
				8471	return PTR_ERR((void *)hva);
				8472	} else {
				8473	if (!slot->npages)
				8474	return 0;
				8475
				8476	hva = 0;
				8477	}
				8478
				8479	old = *slot;
				8480	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				8481	struct kvm_userspace_memory_region m;
				8482
				8483	m.slot = id \| (i << 16);
				8484	m.flags = 0;
				8485	m.guest_phys_addr = gpa;
				8486	m.userspace_addr = hva;
				8487	m.memory_size = size;
				8488	r = __kvm_set_memory_region(kvm, &m);
				8489	if (r < 0)
				8490	return r;
				8491	}
				8492
				8493	if (!size)
				8494	vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
				8495
				8496	return 0;
				8497	}
				8498	EXPORT_SYMBOL_GPL(__x86_set_memory_region);
				8499
				8500	int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
				8501	{
				8502	int r;
				8503
				8504	mutex_lock(&kvm->slots_lock);
				8505	r = __x86_set_memory_region(kvm, id, gpa, size);
				8506	mutex_unlock(&kvm->slots_lock);
				8507
				8508	return r;
				8509	}
				8510	EXPORT_SYMBOL_GPL(x86_set_memory_region);
				8511
				8512	void kvm_arch_pre_destroy_vm(struct kvm *kvm)
				8513	{
				8514	kvm_mmu_pre_destroy_vm(kvm);
				8515	}
				8516
				8517	void kvm_arch_destroy_vm(struct kvm *kvm)
				8518	{
				8519	if (current->mm == kvm->mm) {
				8520	/*
				8521	* Free memory regions allocated on behalf of userspace,
				8522	* unless the the memory map has changed due to process exit
				8523	* or fd copying.
				8524	*/
				8525	x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
				8526	x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
				8527	x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
				8528	}
				8529	if (kvm_x86_ops->vm_destroy)
				8530	kvm_x86_ops->vm_destroy(kvm);
				8531	kvm_pic_destroy(kvm);
				8532	kvm_ioapic_destroy(kvm);
				8533	kvm_free_vcpus(kvm);
				8534	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
				8535	kvm_mmu_uninit_vm(kvm);
				8536	kvm_page_track_cleanup(kvm);
				8537	}
				8538
				8539	void kvm_arch_free_memslot(struct kvm kvm, struct kvm_memory_slot free,
				8540	struct kvm_memory_slot *dont)
				8541	{
				8542	int i;
				8543
				8544	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				8545	if (!dont \|\| free->arch.rmap[i] != dont->arch.rmap[i]) {
				8546	kvfree(free->arch.rmap[i]);
				8547	free->arch.rmap[i] = NULL;
				8548	}
				8549	if (i == 0)
				8550	continue;
				8551
				8552	if (!dont \|\| free->arch.lpage_info[i - 1] !=
				8553	dont->arch.lpage_info[i - 1]) {
				8554	kvfree(free->arch.lpage_info[i - 1]);
				8555	free->arch.lpage_info[i - 1] = NULL;
				8556	}
				8557	}
				8558
				8559	kvm_page_track_free_memslot(free, dont);
				8560	}
				8561
				8562	int kvm_arch_create_memslot(struct kvm kvm, struct kvm_memory_slot slot,
				8563	unsigned long npages)
				8564	{
				8565	int i;
				8566
				8567	/*
				8568	* Clear out the previous array pointers for the KVM_MR_MOVE case. The
				8569	* old arrays will be freed by __kvm_set_memory_region() if installing
				8570	* the new memslot is successful.
				8571	*/
				8572	memset(&slot->arch, 0, sizeof(slot->arch));
				8573
				8574	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				8575	struct kvm_lpage_info *linfo;
				8576	unsigned long ugfn;
				8577	int lpages;
				8578	int level = i + 1;
				8579
				8580	lpages = gfn_to_index(slot->base_gfn + npages - 1,
				8581	slot->base_gfn, level) + 1;
				8582
				8583	slot->arch.rmap[i] =
				8584	kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL);
				8585	if (!slot->arch.rmap[i])
				8586	goto out_free;
				8587	if (i == 0)
				8588	continue;
				8589
				8590	linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL);
				8591	if (!linfo)
				8592	goto out_free;
				8593
				8594	slot->arch.lpage_info[i - 1] = linfo;
				8595
				8596	if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
				8597	linfo[0].disallow_lpage = 1;
				8598	if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
				8599	linfo[lpages - 1].disallow_lpage = 1;
				8600	ugfn = slot->userspace_addr >> PAGE_SHIFT;
				8601	/*
				8602	* If the gfn and userspace address are not aligned wrt each
				8603	* other, or if explicitly asked to, disable large page
				8604	* support for this slot
				8605	*/
				8606	if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) \|\|
				8607	!kvm_largepages_enabled()) {
				8608	unsigned long j;
				8609
				8610	for (j = 0; j < lpages; ++j)
				8611	linfo[j].disallow_lpage = 1;
				8612	}
				8613	}
				8614
				8615	if (kvm_page_track_create_memslot(slot, npages))
				8616	goto out_free;
				8617
				8618	return 0;
				8619
				8620	out_free:
				8621	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				8622	kvfree(slot->arch.rmap[i]);
				8623	slot->arch.rmap[i] = NULL;
				8624	if (i == 0)
				8625	continue;
				8626
				8627	kvfree(slot->arch.lpage_info[i - 1]);
				8628	slot->arch.lpage_info[i - 1] = NULL;
				8629	}
				8630	return -ENOMEM;
				8631	}
				8632
				8633	void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
				8634	{
				8635	/*
				8636	* memslots->generation has been incremented.
				8637	* mmio generation may have reached its maximum value.
				8638	*/
				8639	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
				8640	}
				8641
				8642	int kvm_arch_prepare_memory_region(struct kvm *kvm,
				8643	struct kvm_memory_slot *memslot,
				8644	const struct kvm_userspace_memory_region *mem,
				8645	enum kvm_mr_change change)
				8646	{
				8647	if (change == KVM_MR_MOVE)
				8648	return kvm_arch_create_memslot(kvm, memslot,
				8649	mem->memory_size >> PAGE_SHIFT);
				8650
				8651	return 0;
				8652	}
				8653
				8654	static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
				8655	struct kvm_memory_slot *new)
				8656	{
				8657	/* Still write protect RO slot */
				8658	if (new->flags & KVM_MEM_READONLY) {
				8659	kvm_mmu_slot_remove_write_access(kvm, new);
				8660	return;
				8661	}
				8662
				8663	/*
				8664	* Call kvm_x86_ops dirty logging hooks when they are valid.
				8665	*
				8666	* kvm_x86_ops->slot_disable_log_dirty is called when:
				8667	*
				8668	* - KVM_MR_CREATE with dirty logging is disabled
				8669	* - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
				8670	*
				8671	* The reason is, in case of PML, we need to set D-bit for any slots
				8672	* with dirty logging disabled in order to eliminate unnecessary GPA
				8673	* logging in PML buffer (and potential PML buffer full VMEXT). This
				8674	* guarantees leaving PML enabled during guest's lifetime won't have
				8675	* any additonal overhead from PML when guest is running with dirty
				8676	* logging disabled for memory slots.
				8677	*
				8678	* kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
				8679	* to dirty logging mode.
				8680	*
				8681	* If kvm_x86_ops dirty logging hooks are invalid, use write protect.
				8682	*
				8683	* In case of write protect:
				8684	*
				8685	* Write protect all pages for dirty logging.
				8686	*
				8687	* All the sptes including the large sptes which point to this
				8688	* slot are set to readonly. We can not create any new large
				8689	* spte on this slot until the end of the logging.
				8690	*
				8691	* See the comments in fast_page_fault().
				8692	*/
				8693	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				8694	if (kvm_x86_ops->slot_enable_log_dirty)
				8695	kvm_x86_ops->slot_enable_log_dirty(kvm, new);
				8696	else
				8697	kvm_mmu_slot_remove_write_access(kvm, new);
				8698	} else {
				8699	if (kvm_x86_ops->slot_disable_log_dirty)
				8700	kvm_x86_ops->slot_disable_log_dirty(kvm, new);
				8701	}
				8702	}
				8703
				8704	void kvm_arch_commit_memory_region(struct kvm *kvm,
				8705	const struct kvm_userspace_memory_region *mem,
				8706	const struct kvm_memory_slot *old,
				8707	const struct kvm_memory_slot *new,
				8708	enum kvm_mr_change change)
				8709	{
				8710	int nr_mmu_pages = 0;
				8711
				8712	if (!kvm->arch.n_requested_mmu_pages)
				8713	nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
				8714
				8715	if (nr_mmu_pages)
				8716	kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
				8717
				8718	/*
				8719	* Dirty logging tracks sptes in 4k granularity, meaning that large
				8720	* sptes have to be split. If live migration is successful, the guest
				8721	* in the source machine will be destroyed and large sptes will be
				8722	* created in the destination. However, if the guest continues to run
				8723	* in the source machine (for example if live migration fails), small
				8724	* sptes will remain around and cause bad performance.
				8725	*
				8726	* Scan sptes if dirty logging has been stopped, dropping those
				8727	* which can be collapsed into a single large-page spte. Later
				8728	* page faults will create the large-page sptes.
				8729	*/
				8730	if ((change != KVM_MR_DELETE) &&
				8731	(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
				8732	!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
				8733	kvm_mmu_zap_collapsible_sptes(kvm, new);
				8734
				8735	/*
				8736	* Set up write protection and/or dirty logging for the new slot.
				8737	*
				8738	* For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
				8739	* been zapped so no dirty logging staff is needed for old slot. For
				8740	* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
				8741	* new and it's also covered when dealing with the new slot.
				8742	*
				8743	* FIXME: const-ify all uses of struct kvm_memory_slot.
				8744	*/
				8745	if (change != KVM_MR_DELETE)
				8746	kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
				8747	}
				8748
				8749	void kvm_arch_flush_shadow_all(struct kvm *kvm)
				8750	{
				8751	kvm_mmu_invalidate_zap_all_pages(kvm);
				8752	}
				8753
				8754	void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				8755	struct kvm_memory_slot *slot)
				8756	{
				8757	kvm_page_track_flush_slot(kvm, slot);
				8758	}
				8759
				8760	static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
				8761	{
				8762	if (!list_empty_careful(&vcpu->async_pf.done))
				8763	return true;
				8764
				8765	if (kvm_apic_has_events(vcpu))
				8766	return true;
				8767
				8768	if (vcpu->arch.pv.pv_unhalted)
				8769	return true;
				8770
				8771	if (vcpu->arch.exception.pending)
				8772	return true;
				8773
				8774	if (kvm_test_request(KVM_REQ_NMI, vcpu) \|\|
				8775	(vcpu->arch.nmi_pending &&
				8776	kvm_x86_ops->nmi_allowed(vcpu)))
				8777	return true;
				8778
				8779	if (kvm_test_request(KVM_REQ_SMI, vcpu) \|\|
				8780	(vcpu->arch.smi_pending && !is_smm(vcpu)))
				8781	return true;
				8782
				8783	if (kvm_arch_interrupt_allowed(vcpu) &&
				8784	kvm_cpu_has_interrupt(vcpu))
				8785	return true;
				8786
				8787	if (kvm_hv_has_stimer_pending(vcpu))
				8788	return true;
				8789
				8790	return false;
				8791	}
				8792
				8793	int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
				8794	{
				8795	return kvm_vcpu_running(vcpu) \|\| kvm_vcpu_has_events(vcpu);
				8796	}
				8797
				8798	bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
				8799	{
				8800	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
				8801	return true;
				8802
				8803	if (kvm_test_request(KVM_REQ_NMI, vcpu) \|\|
				8804	kvm_test_request(KVM_REQ_SMI, vcpu) \|\|
				8805	kvm_test_request(KVM_REQ_EVENT, vcpu))
				8806	return true;
				8807
				8808	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
				8809	return true;
				8810
				8811	return false;
				8812	}
				8813
				8814	bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
				8815	{
				8816	return vcpu->arch.preempted_in_kernel;
				8817	}
				8818
				8819	int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
				8820	{
				8821	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
				8822	}
				8823
				8824	int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
				8825	{
				8826	return kvm_x86_ops->interrupt_allowed(vcpu);
				8827	}
				8828
				8829	unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
				8830	{
				8831	if (is_64_bit_mode(vcpu))
				8832	return kvm_rip_read(vcpu);
				8833	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
				8834	kvm_rip_read(vcpu));
				8835	}
				8836	EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
				8837
				8838	bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
				8839	{
				8840	return kvm_get_linear_rip(vcpu) == linear_rip;
				8841	}
				8842	EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
				8843
				8844	unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
				8845	{
				8846	unsigned long rflags;
				8847
				8848	rflags = kvm_x86_ops->get_rflags(vcpu);
				8849	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
				8850	rflags &= ~X86_EFLAGS_TF;
				8851	return rflags;
				8852	}
				8853	EXPORT_SYMBOL_GPL(kvm_get_rflags);
				8854
				8855	static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
				8856	{
				8857	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
				8858	kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
				8859	rflags \|= X86_EFLAGS_TF;
				8860	kvm_x86_ops->set_rflags(vcpu, rflags);
				8861	}
				8862
				8863	void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
				8864	{
				8865	__kvm_set_rflags(vcpu, rflags);
				8866	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8867	}
				8868	EXPORT_SYMBOL_GPL(kvm_set_rflags);
				8869
				8870	void kvm_arch_async_page_ready(struct kvm_vcpu vcpu, struct kvm_async_pf work)
				8871	{
				8872	int r;
				8873
				8874	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) \|\|
				8875	work->wakeup_all)
				8876	return;
				8877
				8878	r = kvm_mmu_reload(vcpu);
				8879	if (unlikely(r))
				8880	return;
				8881
				8882	if (!vcpu->arch.mmu.direct_map &&
				8883	work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
				8884	return;
				8885
				8886	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
				8887	}
				8888
				8889	static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
				8890	{
				8891	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
				8892	}
				8893
				8894	static inline u32 kvm_async_pf_next_probe(u32 key)
				8895	{
				8896	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
				8897	}
				8898
				8899	static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				8900	{
				8901	u32 key = kvm_async_pf_hash_fn(gfn);
				8902
				8903	while (vcpu->arch.apf.gfns[key] != ~0)
				8904	key = kvm_async_pf_next_probe(key);
				8905
				8906	vcpu->arch.apf.gfns[key] = gfn;
				8907	}
				8908
				8909	static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
				8910	{
				8911	int i;
				8912	u32 key = kvm_async_pf_hash_fn(gfn);
				8913
				8914	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
				8915	(vcpu->arch.apf.gfns[key] != gfn &&
				8916	vcpu->arch.apf.gfns[key] != ~0); i++)
				8917	key = kvm_async_pf_next_probe(key);
				8918
				8919	return key;
				8920	}
				8921
				8922	bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				8923	{
				8924	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
				8925	}
				8926
				8927	static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				8928	{
				8929	u32 i, j, k;
				8930
				8931	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
				8932	while (true) {
				8933	vcpu->arch.apf.gfns[i] = ~0;
				8934	do {
				8935	j = kvm_async_pf_next_probe(j);
				8936	if (vcpu->arch.apf.gfns[j] == ~0)
				8937	return;
				8938	k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
				8939	/*
				8940	* k lies cyclically in ]i,j]
				8941	* \| i.k.j \|
				8942	* \|....j i.k.\| or \|.k..j i...\|
				8943	*/
				8944	} while ((i <= j) ? (i < k && k <= j) : (i < k \|\| k <= j));
				8945	vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
				8946	i = j;
				8947	}
				8948	}
				8949
				8950	static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
				8951	{
				8952
				8953	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
				8954	sizeof(val));
				8955	}
				8956
				8957	static int apf_get_user(struct kvm_vcpu vcpu, u32 val)
				8958	{
				8959
				8960	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
				8961	sizeof(u32));
				8962	}
				8963
				8964	void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
				8965	struct kvm_async_pf *work)
				8966	{
				8967	struct x86_exception fault;
				8968
				8969	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
				8970	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
				8971
				8972	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) \|\|
				8973	(vcpu->arch.apf.send_user_only &&
				8974	kvm_x86_ops->get_cpl(vcpu) == 0))
				8975	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
				8976	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
				8977	fault.vector = PF_VECTOR;
				8978	fault.error_code_valid = true;
				8979	fault.error_code = 0;
				8980	fault.nested_page_fault = false;
				8981	fault.address = work->arch.token;
				8982	fault.async_page_fault = true;
				8983	kvm_inject_page_fault(vcpu, &fault);
				8984	}
				8985	}
				8986
				8987	void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
				8988	struct kvm_async_pf *work)
				8989	{
				8990	struct x86_exception fault;
				8991	u32 val;
				8992
				8993	if (work->wakeup_all)
				8994	work->arch.token = ~0; /* broadcast wakeup */
				8995	else
				8996	kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
				8997	trace_kvm_async_pf_ready(work->arch.token, work->gva);
				8998
				8999	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
				9000	!apf_get_user(vcpu, &val)) {
				9001	if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
				9002	vcpu->arch.exception.pending &&
				9003	vcpu->arch.exception.nr == PF_VECTOR &&
				9004	!apf_put_user(vcpu, 0)) {
				9005	vcpu->arch.exception.injected = false;
				9006	vcpu->arch.exception.pending = false;
				9007	vcpu->arch.exception.nr = 0;
				9008	vcpu->arch.exception.has_error_code = false;
				9009	vcpu->arch.exception.error_code = 0;
				9010	} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
				9011	fault.vector = PF_VECTOR;
				9012	fault.error_code_valid = true;
				9013	fault.error_code = 0;
				9014	fault.nested_page_fault = false;
				9015	fault.address = work->arch.token;
				9016	fault.async_page_fault = true;
				9017	kvm_inject_page_fault(vcpu, &fault);
				9018	}
				9019	}
				9020	vcpu->arch.apf.halted = false;
				9021	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				9022	}
				9023
				9024	bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
				9025	{
				9026	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
				9027	return true;
				9028	else
				9029	return kvm_can_do_async_pf(vcpu);
				9030	}
				9031
				9032	void kvm_arch_start_assignment(struct kvm *kvm)
				9033	{
				9034	atomic_inc(&kvm->arch.assigned_device_count);
				9035	}
				9036	EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
				9037
				9038	void kvm_arch_end_assignment(struct kvm *kvm)
				9039	{
				9040	atomic_dec(&kvm->arch.assigned_device_count);
				9041	}
				9042	EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
				9043
				9044	bool kvm_arch_has_assigned_device(struct kvm *kvm)
				9045	{
				9046	return atomic_read(&kvm->arch.assigned_device_count);
				9047	}
				9048	EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
				9049
				9050	void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
				9051	{
				9052	atomic_inc(&kvm->arch.noncoherent_dma_count);
				9053	}
				9054	EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
				9055
				9056	void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
				9057	{
				9058	atomic_dec(&kvm->arch.noncoherent_dma_count);
				9059	}
				9060	EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
				9061
				9062	bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
				9063	{
				9064	return atomic_read(&kvm->arch.noncoherent_dma_count);
				9065	}
				9066	EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
				9067
				9068	bool kvm_arch_has_irq_bypass(void)
				9069	{
				9070	return kvm_x86_ops->update_pi_irte != NULL;
				9071	}
				9072
				9073	int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				9074	struct irq_bypass_producer *prod)
				9075	{
				9076	struct kvm_kernel_irqfd *irqfd =
				9077	container_of(cons, struct kvm_kernel_irqfd, consumer);
				9078
				9079	irqfd->producer = prod;
				9080
				9081	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
				9082	prod->irq, irqfd->gsi, 1);
				9083	}
				9084
				9085	void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				9086	struct irq_bypass_producer *prod)
				9087	{
				9088	int ret;
				9089	struct kvm_kernel_irqfd *irqfd =
				9090	container_of(cons, struct kvm_kernel_irqfd, consumer);
				9091
				9092	WARN_ON(irqfd->producer != prod);
				9093	irqfd->producer = NULL;
				9094
				9095	/*
				9096	* When producer of consumer is unregistered, we change back to
				9097	* remapped mode, so we can re-use the current implementation
				9098	* when the irq is masked/disabled or the consumer side (KVM
				9099	* int this case doesn't want to receive the interrupts.
				9100	*/
				9101	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
				9102	if (ret)
				9103	printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
				9104	" fails: %d\n", irqfd->consumer.token, ret);
				9105	}
				9106
				9107	int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				9108	uint32_t guest_irq, bool set)
				9109	{
				9110	if (!kvm_x86_ops->update_pi_irte)
				9111	return -EINVAL;
				9112
				9113	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
				9114	}
				9115
				9116	bool kvm_vector_hashing_enabled(void)
				9117	{
				9118	return vector_hashing;
				9119	}
				9120	EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
				9121
				/* Export the KVM tracepoints below to GPL modules. */
				9122	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
				9123	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
				9124	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
				9125	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
				9126	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
				9127	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
				9128	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
				9129	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
				9130	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
				9131	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
				9132	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
				9133	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
				9134	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
				9135	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
				9136	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
				9137	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
				9138	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
				9139	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
				9140	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);