Blame - src/kernel/linux/v4.19/arch/x86/kvm/svm.c - T800

blob: 7657dcd72134bd607fdd38ad62faf6e45db4241b [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* AMD SVM support
				5	*
				6	* Copyright (C) 2006 Qumranet, Inc.
				7	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				8	*
				9	* Authors:
				10	* Yaniv Kamay <yaniv@qumranet.com>
				11	* Avi Kivity <avi@qumranet.com>
				12	*
				13	* This work is licensed under the terms of the GNU GPL, version 2. See
				14	* the COPYING file in the top-level directory.
				15	*
				16	*/
				17
				18	#define pr_fmt(fmt) "SVM: " fmt
				19
				20	#include <linux/kvm_host.h>
				21
				22	#include "irq.h"
				23	#include "mmu.h"
				24	#include "kvm_cache_regs.h"
				25	#include "x86.h"
				26	#include "cpuid.h"
				27	#include "pmu.h"
				28
				29	#include <linux/module.h>
				30	#include <linux/mod_devicetable.h>
				31	#include <linux/kernel.h>
				32	#include <linux/vmalloc.h>
				33	#include <linux/highmem.h>
				34	#include <linux/sched.h>
				35	#include <linux/trace_events.h>
				36	#include <linux/slab.h>
				37	#include <linux/amd-iommu.h>
				38	#include <linux/hashtable.h>
				39	#include <linux/frame.h>
				40	#include <linux/psp-sev.h>
				41	#include <linux/file.h>
				42	#include <linux/pagemap.h>
				43	#include <linux/swap.h>
				44
				45	#include <asm/apic.h>
				46	#include <asm/perf_event.h>
				47	#include <asm/tlbflush.h>
				48	#include <asm/desc.h>
				49	#include <asm/debugreg.h>
				50	#include <asm/kvm_para.h>
				51	#include <asm/irq_remapping.h>
				52	#include <asm/spec-ctrl.h>
				53
				54	#include <asm/virtext.h>
				55	#include "trace.h"
				56
				57	#define __ex(x) __kvm_handle_fault_on_reboot(x)
				58
				59	MODULE_AUTHOR("Qumranet");
				60	MODULE_LICENSE("GPL");
				61
				62	static const struct x86_cpu_id svm_cpu_id[] = {
				63	X86_FEATURE_MATCH(X86_FEATURE_SVM),
				64	{}
				65	};
				66	MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
				67
				/* Allocation orders used for the IOPM and the MSRPM. */
				#define IOPM_ALLOC_ORDER		2
				#define MSRPM_ALLOC_ORDER		1

				/* Segment type codes. */
				#define SEG_TYPE_LDT			2
				#define SEG_TYPE_BUSY_TSS16		3

				/* SVM feature bits. */
				#define SVM_FEATURE_NPT			(1 << 0)
				#define SVM_FEATURE_LBRV		(1 << 1)
				#define SVM_FEATURE_SVML		(1 << 2)
				#define SVM_FEATURE_NRIP		(1 << 3)
				#define SVM_FEATURE_TSC_RATE		(1 << 4)
				#define SVM_FEATURE_VMCB_CLEAN		(1 << 5)
				#define SVM_FEATURE_FLUSH_ASID		(1 << 6)
				#define SVM_FEATURE_DECODE_ASSIST	(1 << 7)
				#define SVM_FEATURE_PAUSE_FILTER	(1 << 10)

				/* AVIC doorbell. */
				#define SVM_AVIC_DOORBELL		0xc001011b

				/* Possible verdicts when deciding who handles an exit in nested mode. */
				#define NESTED_EXIT_HOST		0	/* Exit handled on host level */
				#define NESTED_EXIT_DONE		1	/* Exit caused nested vmexit */
				#define NESTED_EXIT_CONTINUE		2	/* Further checks needed */

				/* Everything above the low six DEBUGCTL bits is reserved. */
				#define DEBUGCTL_RESERVED_BITS		(~(0x3fULL))

				/* TSC ratio: reserved bits and the accepted value range. */
				#define TSC_RATIO_RSVD			0xffffff0000000000ULL
				#define TSC_RATIO_MIN			0x0000000000000001ULL
				#define TSC_RATIO_MAX			0x000000ffffffffffULL
				95
				/*
				 * Mask that clears bits 63:52 and bits 11:0 of a value. The whole
				 * expansion is parenthesized so it composes safely in any expression.
				 */
				#define AVIC_HPA_MASK	(~((0xFFFULL << 52) | 0xFFF))

				/*
				 * 0xff is broadcast, so the max index allowed for physical APIC ID
				 * table is 0xfe. APIC IDs above 0xff are reserved.
				 */
				#define AVIC_MAX_PHYSICAL_ID_COUNT	255

				#define AVIC_UNACCEL_ACCESS_WRITE_MASK		1
				#define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
				#define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF

				/* AVIC GATAG is encoded using VM and VCPU IDs */
				#define AVIC_VCPU_ID_BITS		8
				#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)

				#define AVIC_VM_ID_BITS			24
				#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
				#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)

				/*
				 * Pack a VM id (x) and a VCPU id (y) into one tag: the VM id occupies the
				 * bits above AVIC_VCPU_ID_BITS, the VCPU id the low AVIC_VCPU_ID_BITS.
				 * Arguments are parenthesized so that compound expressions expand safely.
				 */
				#define AVIC_GATAG(x, y)		((((x) & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
									 ((y) & AVIC_VCPU_ID_MASK))
				/* Extract the VM id / VCPU id back out of a tag built by AVIC_GATAG(). */
				#define AVIC_GATAG_TO_VMID(x)		(((x) >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
				#define AVIC_GATAG_TO_VCPUID(x)		((x) & AVIC_VCPU_ID_MASK)
				120
				121	static bool erratum_383_found __read_mostly;
				122
				123	static const u32 host_save_user_msrs[] = {
				124	#ifdef CONFIG_X86_64
				125	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
				126	MSR_FS_BASE,
				127	#endif
				128	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
				129	MSR_TSC_AUX,
				130	};
				131
				132	#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
				133
				134	struct kvm_sev_info {
				135	bool active; /* SEV enabled guest */
				136	unsigned int asid; /* ASID used for this guest */
				137	unsigned int handle; /* SEV firmware handle */
				138	int fd; /* SEV device fd */
				139	unsigned long pages_locked; /* Number of pages locked */
				140	struct list_head regions_list; /* List of registered regions */
				141	};
				142
				143	struct kvm_svm {
				144	struct kvm kvm;
				145
				146	/* Struct members for AVIC */
				147	u32 avic_vm_id;
				148	u32 ldr_mode;
				149	struct page *avic_logical_id_table_page;
				150	struct page *avic_physical_id_table_page;
				151	struct hlist_node hnode;
				152
				153	struct kvm_sev_info sev_info;
				154	};
				155
				156	struct kvm_vcpu;
				157
				158	struct nested_state {
				159	struct vmcb *hsave;
				160	u64 hsave_msr;
				161	u64 vm_cr_msr;
				162	u64 vmcb;
				163
				164	/* These are the merged vectors */
				165	u32 *msrpm;
				166
				167	/* gpa pointers to the real vectors */
				168	u64 vmcb_msrpm;
				169	u64 vmcb_iopm;
				170
				171	/* A VMEXIT is required but not yet emulated */
				172	bool exit_required;
				173
				174	/* cache for intercepts of the guest */
				175	u32 intercept_cr;
				176	u32 intercept_dr;
				177	u32 intercept_exceptions;
				178	u64 intercept;
				179
				180	/* Nested Paging related state */
				181	u64 nested_cr3;
				182	};
				183
				184	#define MSRPM_OFFSETS 16
				185	static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
				186
				187	/*
				188	* Set osvw_len to higher value when updated Revision Guides
				189	* are published and we know what the new status bits are
				190	*/
				191	static uint64_t osvw_len = 4, osvw_status;
				192
				193	struct vcpu_svm {
				194	struct kvm_vcpu vcpu;
				195	struct vmcb *vmcb;
				196	unsigned long vmcb_pa;
				197	struct svm_cpu_data *svm_data;
				198	uint64_t asid_generation;
				199	uint64_t sysenter_esp;
				200	uint64_t sysenter_eip;
				201	uint64_t tsc_aux;
				202
				203	u64 msr_decfg;
				204
				205	u64 next_rip;
				206
				207	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
				208	struct {
				209	u16 fs;
				210	u16 gs;
				211	u16 ldt;
				212	u64 gs_base;
				213	} host;
				214
				215	u64 spec_ctrl;
				216	/*
				217	* Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
				218	* translated into the appropriate L2_CFG bits on the host to
				219	* perform speculative control.
				220	*/
				221	u64 virt_spec_ctrl;
				222
				223	u32 *msrpm;
				224
				225	ulong nmi_iret_rip;
				226
				227	struct nested_state nested;
				228
				229	bool nmi_singlestep;
				230	u64 nmi_singlestep_guest_rflags;
				231
				232	unsigned int3_injected;
				233	unsigned long int3_rip;
				234
				235	/* cached guest cpuid flags for faster access */
				236	bool nrips_enabled : 1;
				237
				238	u32 ldr_reg;
				239	struct page *avic_backing_page;
				240	u64 *avic_physical_id_cache;
				241	bool avic_is_running;
				242
				243	/*
				244	* Per-vcpu list of struct amd_svm_iommu_ir:
				245	* This is used mainly to store interrupt remapping information used
				246	* when update the vcpu affinity. This avoids the need to scan for
				247	* IRTE and try to match ga_tag in the IOMMU driver.
				248	*/
				249	struct list_head ir_list;
				250	spinlock_t ir_list_lock;
				251
				252	/* which host CPU was used for running this vcpu */
				253	unsigned int last_cpu;
				254	};
				255
				256	/*
				257	* This is a wrapper of struct amd_iommu_ir_data.
				258	*/
				259	struct amd_svm_iommu_ir {
				260	struct list_head node; /* Used by SVM for per-vcpu ir_list */
				261	void data; / Storing pointer to struct amd_ir_data */
				262	};
				263
				/* Fields of an AVIC logical ID table entry. */
				#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
				#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK		(1 << 31)

				/* Fields of an AVIC physical ID table entry. */
				#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK	(0xFFULL)
				#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	(0xFFFFFFFFFFULL << 12)
				#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK		(1ULL << 62)
				#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK		(1ULL << 63)

				/* Per-cpu record of the value last written to MSR_AMD64_TSC_RATIO. */
				static DEFINE_PER_CPU(u64, current_tsc_ratio);
				#define TSC_RATIO_DEFAULT	0x0100000000ULL

				/* Sentinel used where no valid MSR index / offset exists. */
				#define MSR_INVALID		0xffffffffU
				276
				277	static const struct svm_direct_access_msrs {
				278	u32 index; /* Index of the MSR */
				279	bool always; /* True if intercept is always on */
				280	} direct_access_msrs[] = {
				281	{ .index = MSR_STAR, .always = true },
				282	{ .index = MSR_IA32_SYSENTER_CS, .always = true },
				283	#ifdef CONFIG_X86_64
				284	{ .index = MSR_GS_BASE, .always = true },
				285	{ .index = MSR_FS_BASE, .always = true },
				286	{ .index = MSR_KERNEL_GS_BASE, .always = true },
				287	{ .index = MSR_LSTAR, .always = true },
				288	{ .index = MSR_CSTAR, .always = true },
				289	{ .index = MSR_SYSCALL_MASK, .always = true },
				290	#endif
				291	{ .index = MSR_IA32_SPEC_CTRL, .always = false },
				292	{ .index = MSR_IA32_PRED_CMD, .always = false },
				293	{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
				294	{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
				295	{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
				296	{ .index = MSR_IA32_LASTINTTOIP, .always = false },
				297	{ .index = MSR_INVALID, .always = false },
				298	};
				299
				300	/* enable NPT for AMD64 and X86 with PAE */
				301	#if defined(CONFIG_X86_64) \|\| defined(CONFIG_X86_PAE)
				302	static bool npt_enabled = true;
				303	#else
				304	static bool npt_enabled;
				305	#endif
				306
				307	/*
				308	* These 2 parameters are used to config the controls for Pause-Loop Exiting:
				309	* pause_filter_count: On processors that support Pause filtering(indicated
				310	* by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
				311	* count value. On VMRUN this value is loaded into an internal counter.
				312	* Each time a pause instruction is executed, this counter is decremented
				313	* until it reaches zero at which time a #VMEXIT is generated if pause
				314	* intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
				315	* Intercept Filtering for more details.
				316	* This also indicates whether PLE logic is enabled.
				317	*
				318	* pause_filter_thresh: In addition, some processor families support advanced
				319	* pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
				320	* the amount of time a guest is allowed to execute in a pause loop.
				321	* In this mode, a 16-bit pause filter threshold field is added in the
				322	* VMCB. The threshold value is a cycle count that is used to reset the
				323	* pause counter. As with simple pause filtering, VMRUN loads the pause
				324	* count value from VMCB into an internal counter. Then, on each pause
				325	* instruction the hardware checks the elapsed number of cycles since
				326	* the most recent pause instruction against the pause filter threshold.
				327	* If the elapsed cycle count is greater than the pause filter threshold,
				328	* then the internal pause count is reloaded from the VMCB and execution
				329	* continues. If the elapsed cycle count is less than the pause filter
				330	* threshold, then the internal pause count is decremented. If the count
				331	* value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
				332	* triggered. If advanced pause filtering is supported and pause filter
				333	* threshold field is set to zero, the filter will operate in the simpler,
				334	* count only mode.
				335	*/
				336
				337	static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
				338	module_param(pause_filter_thresh, ushort, 0444);
				339
				340	static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
				341	module_param(pause_filter_count, ushort, 0444);
				342
				343	/* Default doubles per-vcpu window every exit. */
				344	static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
				345	module_param(pause_filter_count_grow, ushort, 0444);
				346
				347	/* Default resets per-vcpu window every exit to pause_filter_count. */
				348	static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
				349	module_param(pause_filter_count_shrink, ushort, 0444);
				350
				351	/* Default is to compute the maximum so we can never overflow. */
				352	static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
				353	module_param(pause_filter_count_max, ushort, 0444);
				354
				355	/* allow nested paging (virtualized MMU) for all guests */
				356	static int npt = true;
				357	module_param(npt, int, S_IRUGO);
				358
				359	/* allow nested virtualization in KVM/SVM */
				360	static int nested = true;
				361	module_param(nested, int, S_IRUGO);
				362
				363	/* enable / disable AVIC */
				364	static int avic;
				365	#ifdef CONFIG_X86_LOCAL_APIC
				366	module_param(avic, int, S_IRUGO);
				367	#endif
				368
				369	/* enable/disable Virtual VMLOAD VMSAVE */
				370	static int vls = true;
				371	module_param(vls, int, 0444);
				372
				373	/* enable/disable Virtual GIF */
				374	static int vgif = true;
				375	module_param(vgif, int, 0444);
				376
				377	/* enable/disable SEV support */
				378	static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
				379	module_param(sev, int, 0444);
				380
				381	static u8 rsm_ins_bytes[] = "\x0f\xaa";
				382
				383	static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
				384	static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
				385	static void svm_complete_interrupts(struct vcpu_svm *svm);
				386
				387	static int nested_svm_exit_handled(struct vcpu_svm *svm);
				388	static int nested_svm_intercept(struct vcpu_svm *svm);
				389	static int nested_svm_vmexit(struct vcpu_svm *svm);
				390	static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				391	bool has_error_code, u32 error_code);
				392
				/*
				 * Bit positions within vmcb->control.clean. mark_dirty() clears one of
				 * these bits, mark_all_clean() sets every bit below VMCB_DIRTY_MAX except
				 * the ones in VMCB_ALWAYS_DIRTY_MASK.
				 */
				enum {
					VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
							    pause filter count */
					VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
					VMCB_ASID,	 /* ASID */
					VMCB_INTR,	 /* int_ctl, int_vector */
					VMCB_NPT,        /* npt_en, nCR3, gPAT */
					VMCB_CR,	 /* CR0, CR3, CR4, EFER */
					VMCB_DR,         /* DR6, DR7 */
					VMCB_DT,         /* GDT, IDT */
					VMCB_SEG,        /* CS, DS, SS, ES, CPL */
					VMCB_CR2,        /* CR2 only */
					VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
					VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
							  * AVIC PHYSICAL_TABLE pointer,
							  * AVIC LOGICAL_TABLE pointer
							  */
					VMCB_DIRTY_MAX,
				};

				/* TPR and CR2 are always written before VMRUN */
				#define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))

				/* Bits of the value kept by avic_update_vapic_bar(). */
				#define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL
				417
				418	static unsigned int max_sev_asid;
				419	static unsigned int min_sev_asid;
				420	static unsigned long *sev_asid_bitmap;
				421	#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
				422
				423	struct enc_region {
				424	struct list_head list;
				425	unsigned long npages;
				426	struct page **pages;
				427	unsigned long uaddr;
				428	unsigned long size;
				429	};
				430
				431
				432	static inline struct kvm_svm to_kvm_svm(struct kvm kvm)
				433	{
				434	return container_of(kvm, struct kvm_svm, kvm);
				435	}
				436
				437	static inline bool svm_sev_enabled(void)
				438	{
				439	return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
				440	}
				441
				442	static inline bool sev_guest(struct kvm *kvm)
				443	{
				444	#ifdef CONFIG_KVM_AMD_SEV
				445	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				446
				447	return sev->active;
				448	#else
				449	return false;
				450	#endif
				451	}
				452
				453	static inline int sev_get_asid(struct kvm *kvm)
				454	{
				455	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				456
				457	return sev->asid;
				458	}
				459
				460	static inline void mark_all_dirty(struct vmcb *vmcb)
				461	{
				462	vmcb->control.clean = 0;
				463	}
				464
				465	static inline void mark_all_clean(struct vmcb *vmcb)
				466	{
				467	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
				468	& ~VMCB_ALWAYS_DIRTY_MASK;
				469	}
				470
				471	static inline void mark_dirty(struct vmcb *vmcb, int bit)
				472	{
				473	vmcb->control.clean &= ~(1 << bit);
				474	}
				475
				476	static inline struct vcpu_svm to_svm(struct kvm_vcpu vcpu)
				477	{
				478	return container_of(vcpu, struct vcpu_svm, vcpu);
				479	}
				480
				481	static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
				482	{
				483	svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
				484	mark_dirty(svm->vmcb, VMCB_AVIC);
				485	}
				486
				487	static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
				488	{
				489	struct vcpu_svm *svm = to_svm(vcpu);
				490	u64 *entry = svm->avic_physical_id_cache;
				491
				492	if (!entry)
				493	return false;
				494
				495	return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
				496	}
				497
				498	static void recalc_intercepts(struct vcpu_svm *svm)
				499	{
				500	struct vmcb_control_area c, h;
				501	struct nested_state *g;
				502
				503	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
				504
				505	if (!is_guest_mode(&svm->vcpu))
				506	return;
				507
				508	c = &svm->vmcb->control;
				509	h = &svm->nested.hsave->control;
				510	g = &svm->nested;
				511
				512	c->intercept_cr = h->intercept_cr \| g->intercept_cr;
				513	c->intercept_dr = h->intercept_dr \| g->intercept_dr;
				514	c->intercept_exceptions = h->intercept_exceptions \| g->intercept_exceptions;
				515	c->intercept = h->intercept \| g->intercept;
				516	}
				517
				518	static inline struct vmcb get_host_vmcb(struct vcpu_svm svm)
				519	{
				520	if (is_guest_mode(&svm->vcpu))
				521	return svm->nested.hsave;
				522	else
				523	return svm->vmcb;
				524	}
				525
				526	static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
				527	{
				528	struct vmcb *vmcb = get_host_vmcb(svm);
				529
				530	vmcb->control.intercept_cr \|= (1U << bit);
				531
				532	recalc_intercepts(svm);
				533	}
				534
				535	static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
				536	{
				537	struct vmcb *vmcb = get_host_vmcb(svm);
				538
				539	vmcb->control.intercept_cr &= ~(1U << bit);
				540
				541	recalc_intercepts(svm);
				542	}
				543
				544	static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
				545	{
				546	struct vmcb *vmcb = get_host_vmcb(svm);
				547
				548	return vmcb->control.intercept_cr & (1U << bit);
				549	}
				550
				551	static inline void set_dr_intercepts(struct vcpu_svm *svm)
				552	{
				553	struct vmcb *vmcb = get_host_vmcb(svm);
				554
				555	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
				556	\| (1 << INTERCEPT_DR1_READ)
				557	\| (1 << INTERCEPT_DR2_READ)
				558	\| (1 << INTERCEPT_DR3_READ)
				559	\| (1 << INTERCEPT_DR4_READ)
				560	\| (1 << INTERCEPT_DR5_READ)
				561	\| (1 << INTERCEPT_DR6_READ)
				562	\| (1 << INTERCEPT_DR7_READ)
				563	\| (1 << INTERCEPT_DR0_WRITE)
				564	\| (1 << INTERCEPT_DR1_WRITE)
				565	\| (1 << INTERCEPT_DR2_WRITE)
				566	\| (1 << INTERCEPT_DR3_WRITE)
				567	\| (1 << INTERCEPT_DR4_WRITE)
				568	\| (1 << INTERCEPT_DR5_WRITE)
				569	\| (1 << INTERCEPT_DR6_WRITE)
				570	\| (1 << INTERCEPT_DR7_WRITE);
				571
				572	recalc_intercepts(svm);
				573	}
				574
				575	static inline void clr_dr_intercepts(struct vcpu_svm *svm)
				576	{
				577	struct vmcb *vmcb = get_host_vmcb(svm);
				578
				579	vmcb->control.intercept_dr = 0;
				580
				581	recalc_intercepts(svm);
				582	}
				583
				584	static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
				585	{
				586	struct vmcb *vmcb = get_host_vmcb(svm);
				587
				588	vmcb->control.intercept_exceptions \|= (1U << bit);
				589
				590	recalc_intercepts(svm);
				591	}
				592
				593	static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
				594	{
				595	struct vmcb *vmcb = get_host_vmcb(svm);
				596
				597	vmcb->control.intercept_exceptions &= ~(1U << bit);
				598
				599	recalc_intercepts(svm);
				600	}
				601
				602	static inline void set_intercept(struct vcpu_svm *svm, int bit)
				603	{
				604	struct vmcb *vmcb = get_host_vmcb(svm);
				605
				606	vmcb->control.intercept \|= (1ULL << bit);
				607
				608	recalc_intercepts(svm);
				609	}
				610
				611	static inline void clr_intercept(struct vcpu_svm *svm, int bit)
				612	{
				613	struct vmcb *vmcb = get_host_vmcb(svm);
				614
				615	vmcb->control.intercept &= ~(1ULL << bit);
				616
				617	recalc_intercepts(svm);
				618	}
				619
				620	static inline bool vgif_enabled(struct vcpu_svm *svm)
				621	{
				622	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
				623	}
				624
				625	static inline void enable_gif(struct vcpu_svm *svm)
				626	{
				627	if (vgif_enabled(svm))
				628	svm->vmcb->control.int_ctl \|= V_GIF_MASK;
				629	else
				630	svm->vcpu.arch.hflags \|= HF_GIF_MASK;
				631	}
				632
				633	static inline void disable_gif(struct vcpu_svm *svm)
				634	{
				635	if (vgif_enabled(svm))
				636	svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
				637	else
				638	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
				639	}
				640
				641	static inline bool gif_set(struct vcpu_svm *svm)
				642	{
				643	if (vgif_enabled(svm))
				644	return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
				645	else
				646	return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
				647	}
				648
				649	static unsigned long iopm_base;
				650
				651	struct kvm_ldttss_desc {
				652	u16 limit0;
				653	u16 base0;
				654	unsigned base1:8, type:5, dpl:2, p:1;
				655	unsigned limit1:4, zero0:3, g:1, base2:8;
				656	u32 base3;
				657	u32 zero1;
				658	} __attribute__((packed));
				659
				660	struct svm_cpu_data {
				661	int cpu;
				662
				663	u64 asid_generation;
				664	u32 max_asid;
				665	u32 next_asid;
				666	u32 min_asid;
				667	struct kvm_ldttss_desc *tss_desc;
				668
				669	struct page *save_area;
				670	struct vmcb *current_vmcb;
				671
				672	/* index = sev_asid, value = vmcb pointer */
				673	struct vmcb **sev_vmcbs;
				674	};
				675
				676	static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
				677
				/* A CPU number paired with an integer result. */
				struct svm_init_data {
					int cpu;	/* CPU number */
					int r;		/* result value */
				};
				682
				683	static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
				684
				685	#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
				686	#define MSRS_RANGE_SIZE 2048
				687	#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
				688
				689	static u32 svm_msrpm_offset(u32 msr)
				690	{
				691	u32 offset;
				692	int i;
				693
				694	for (i = 0; i < NUM_MSR_MAPS; i++) {
				695	if (msr < msrpm_ranges[i] \|\|
				696	msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
				697	continue;
				698
				699	offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
				700	offset += (i * MSRS_RANGE_SIZE); /* add range offset */
				701
				702	/* Now we have the u8 offset - but need the u32 offset */
				703	return offset / 4;
				704	}
				705
				706	/* MSR not in any range */
				707	return MSR_INVALID;
				708	}
				709
				710	#define MAX_INST_SIZE 15
				711
				712	static inline void clgi(void)
				713	{
				714	asm volatile (__ex(SVM_CLGI));
				715	}
				716
				717	static inline void stgi(void)
				718	{
				719	asm volatile (__ex(SVM_STGI));
				720	}
				721
				722	static inline void invlpga(unsigned long addr, u32 asid)
				723	{
				724	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
				725	}
				726
				727	static int get_npt_level(struct kvm_vcpu *vcpu)
				728	{
				729	#ifdef CONFIG_X86_64
				730	return PT64_ROOT_4LEVEL;
				731	#else
				732	return PT32E_ROOT_LEVEL;
				733	#endif
				734	}
				735
				736	static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
				737	{
				738	vcpu->arch.efer = efer;
				739
				740	if (!npt_enabled) {
				741	/* Shadow paging assumes NX to be available. */
				742	efer \|= EFER_NX;
				743
				744	if (!(efer & EFER_LMA))
				745	efer &= ~EFER_LME;
				746	}
				747
				748	to_svm(vcpu)->vmcb->save.efer = efer \| EFER_SVME;
				749	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
				750	}
				751
				752	static int is_external_interrupt(u32 info)
				753	{
				754	info &= SVM_EVTINJ_TYPE_MASK \| SVM_EVTINJ_VALID;
				755	return info == (SVM_EVTINJ_VALID \| SVM_EVTINJ_TYPE_INTR);
				756	}
				757
				758	static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
				759	{
				760	struct vcpu_svm *svm = to_svm(vcpu);
				761	u32 ret = 0;
				762
				763	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
				764	ret = KVM_X86_SHADOW_INT_STI \| KVM_X86_SHADOW_INT_MOV_SS;
				765	return ret;
				766	}
				767
				768	static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
				769	{
				770	struct vcpu_svm *svm = to_svm(vcpu);
				771
				772	if (mask == 0)
				773	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
				774	else
				775	svm->vmcb->control.int_state \|= SVM_INTERRUPT_SHADOW_MASK;
				776
				777	}
				778
				779	static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
				780	{
				781	struct vcpu_svm *svm = to_svm(vcpu);
				782
				783	if (svm->vmcb->control.next_rip != 0) {
				784	WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
				785	svm->next_rip = svm->vmcb->control.next_rip;
				786	}
				787
				788	if (!svm->next_rip) {
				789	if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
				790	EMULATE_DONE)
				791	printk(KERN_DEBUG "%s: NOP\n", __func__);
				792	return;
				793	}
				794	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
				795	printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
				796	__func__, kvm_rip_read(vcpu), svm->next_rip);
				797
				798	kvm_rip_write(vcpu, svm->next_rip);
				799	svm_set_interrupt_shadow(vcpu, 0);
				800	}
				801
				802	static void svm_queue_exception(struct kvm_vcpu *vcpu)
				803	{
				804	struct vcpu_svm *svm = to_svm(vcpu);
				805	unsigned nr = vcpu->arch.exception.nr;
				806	bool has_error_code = vcpu->arch.exception.has_error_code;
				807	bool reinject = vcpu->arch.exception.injected;
				808	u32 error_code = vcpu->arch.exception.error_code;
				809
				810	/*
				811	* If we are within a nested VM we'd better #VMEXIT and let the guest
				812	* handle the exception
				813	*/
				814	if (!reinject &&
				815	nested_svm_check_exception(svm, nr, has_error_code, error_code))
				816	return;
				817
				818	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
				819	unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
				820
				821	/*
				822	* For guest debugging where we have to reinject #BP if some
				823	* INT3 is guest-owned:
				824	* Emulate nRIP by moving RIP forward. Will fail if injection
				825	* raises a fault that is not intercepted. Still better than
				826	* failing in all cases.
				827	*/
				828	skip_emulated_instruction(&svm->vcpu);
				829	rip = kvm_rip_read(&svm->vcpu);
				830	svm->int3_rip = rip + svm->vmcb->save.cs.base;
				831	svm->int3_injected = rip - old_rip;
				832	}
				833
				834	svm->vmcb->control.event_inj = nr
				835	\| SVM_EVTINJ_VALID
				836	\| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
				837	\| SVM_EVTINJ_TYPE_EXEPT;
				838	svm->vmcb->control.event_inj_err = error_code;
				839	}
				840
				841	static void svm_init_erratum_383(void)
				842	{
				843	u32 low, high;
				844	int err;
				845	u64 val;
				846
				847	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
				848	return;
				849
				850	/* Use _safe variants to not break nested virtualization */
				851	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
				852	if (err)
				853	return;
				854
				855	val \|= (1ULL << 47);
				856
				857	low = lower_32_bits(val);
				858	high = upper_32_bits(val);
				859
				860	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
				861
				862	erratum_383_found = true;
				863	}
				864
				865	static void svm_init_osvw(struct kvm_vcpu *vcpu)
				866	{
				867	/*
				868	* Guests should see errata 400 and 415 as fixed (assuming that
				869	* HLT and IO instructions are intercepted).
				870	*/
				871	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
				872	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
				873
				874	/*
				875	* By increasing VCPU's osvw.length to 3 we are telling the guest that
				876	* all osvw.status bits inside that length, including bit 0 (which is
				877	* reserved for erratum 298), are valid. However, if host processor's
				878	* osvw_len is 0 then osvw_status[0] carries no information. We need to
				879	* be conservative here and therefore we tell the guest that erratum 298
				880	* is present (because we really don't know).
				881	*/
				882	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
				883	vcpu->arch.osvw.status \|= 1;
				884	}
				885
				886	static int has_svm(void)
				887	{
				888	const char *msg;
				889
				890	if (!cpu_has_svm(&msg)) {
				891	printk(KERN_INFO "has_svm: %s\n", msg);
				892	return 0;
				893	}
				894
				895	return 1;
				896	}
				897
				898	static void svm_hardware_disable(void)
				899	{
				900	/* Make sure we clean up behind us */
				901	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
				902	wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
				903
				904	cpu_svm_disable();
				905
				906	amd_pmu_disable_virt();
				907	}
				908
				909	static int svm_hardware_enable(void)
				910	{
				911
				912	struct svm_cpu_data *sd;
				913	uint64_t efer;
				914	struct desc_struct *gdt;
				915	int me = raw_smp_processor_id();
				916
				917	rdmsrl(MSR_EFER, efer);
				918	if (efer & EFER_SVME)
				919	return -EBUSY;
				920
				921	if (!has_svm()) {
				922	pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
				923	return -EINVAL;
				924	}
				925	sd = per_cpu(svm_data, me);
				926	if (!sd) {
				927	pr_err("%s: svm_data is NULL on %d\n", __func__, me);
				928	return -EINVAL;
				929	}
				930
				931	sd->asid_generation = 1;
				932	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
				933	sd->next_asid = sd->max_asid + 1;
				934	sd->min_asid = max_sev_asid + 1;
				935
				936	gdt = get_current_gdt_rw();
				937	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
				938
				939	wrmsrl(MSR_EFER, efer \| EFER_SVME);
				940
				941	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
				942
				943	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
				944	wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
				945	__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
				946	}
				947
				948
				949	/*
				950	* Get OSVW bits.
				951	*
				952	* Note that it is possible to have a system with mixed processor
				953	* revisions and therefore different OSVW bits. If bits are not the same
				954	* on different processors then choose the worst case (i.e. if erratum
				955	* is present on one processor and not on another then assume that the
				956	* erratum is present everywhere).
				957	*/
				958	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
				959	uint64_t len, status = 0;
				960	int err;
				961
				962	len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
				963	if (!err)
				964	status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
				965	&err);
				966
				967	if (err)
				968	osvw_status = osvw_len = 0;
				969	else {
				970	if (len < osvw_len)
				971	osvw_len = len;
				972	osvw_status \|= status;
				973	osvw_status &= (1ULL << osvw_len) - 1;
				974	}
				975	} else
				976	osvw_status = osvw_len = 0;
				977
				978	svm_init_erratum_383();
				979
				980	amd_pmu_enable_virt();
				981
				982	return 0;
				983	}
				984
				985	static void svm_cpu_uninit(int cpu)
				986	{
				987	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
				988
				989	if (!sd)
				990	return;
				991
				992	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
				993	kfree(sd->sev_vmcbs);
				994	__free_page(sd->save_area);
				995	kfree(sd);
				996	}
				997
				998	static int svm_cpu_init(int cpu)
				999	{
				1000	struct svm_cpu_data *sd;
				1001	int r;
				1002
				1003	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
				1004	if (!sd)
				1005	return -ENOMEM;
				1006	sd->cpu = cpu;
				1007	r = -ENOMEM;
				1008	sd->save_area = alloc_page(GFP_KERNEL);
				1009	if (!sd->save_area)
				1010	goto err_1;
				1011
				1012	if (svm_sev_enabled()) {
				1013	r = -ENOMEM;
				1014	sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
				1015	sizeof(void *),
				1016	GFP_KERNEL);
				1017	if (!sd->sev_vmcbs)
				1018	goto err_1;
				1019	}
				1020
				1021	per_cpu(svm_data, cpu) = sd;
				1022
				1023	return 0;
				1024
				1025	err_1:
				1026	kfree(sd);
				1027	return r;
				1028
				1029	}
				1030
				1031	static bool valid_msr_intercept(u32 index)
				1032	{
				1033	int i;
				1034
				1035	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
				1036	if (direct_access_msrs[i].index == index)
				1037	return true;
				1038
				1039	return false;
				1040	}
				1041
				1042	static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
				1043	{
				1044	u8 bit_write;
				1045	unsigned long tmp;
				1046	u32 offset;
				1047	u32 *msrpm;
				1048
				1049	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
				1050	to_svm(vcpu)->msrpm;
				1051
				1052	offset = svm_msrpm_offset(msr);
				1053	bit_write = 2 * (msr & 0x0f) + 1;
				1054	tmp = msrpm[offset];
				1055
				1056	BUG_ON(offset == MSR_INVALID);
				1057
				1058	return !!test_bit(bit_write, &tmp);
				1059	}
				1060
				1061	static void set_msr_interception(u32 *msrpm, unsigned msr,
				1062	int read, int write)
				1063	{
				1064	u8 bit_read, bit_write;
				1065	unsigned long tmp;
				1066	u32 offset;
				1067
				1068	/*
				1069	* If this warning triggers extend the direct_access_msrs list at the
				1070	* beginning of the file
				1071	*/
				1072	WARN_ON(!valid_msr_intercept(msr));
				1073
				1074	offset = svm_msrpm_offset(msr);
				1075	bit_read = 2 * (msr & 0x0f);
				1076	bit_write = 2 * (msr & 0x0f) + 1;
				1077	tmp = msrpm[offset];
				1078
				1079	BUG_ON(offset == MSR_INVALID);
				1080
				1081	read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
				1082	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
				1083
				1084	msrpm[offset] = tmp;
				1085	}
				1086
				1087	static void svm_vcpu_init_msrpm(u32 *msrpm)
				1088	{
				1089	int i;
				1090
				1091	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
				1092
				1093	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
				1094	if (!direct_access_msrs[i].always)
				1095	continue;
				1096
				1097	set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
				1098	}
				1099	}
				1100
				1101	static void add_msr_offset(u32 offset)
				1102	{
				1103	int i;
				1104
				1105	for (i = 0; i < MSRPM_OFFSETS; ++i) {
				1106
				1107	/* Offset already in list? */
				1108	if (msrpm_offsets[i] == offset)
				1109	return;
				1110
				1111	/* Slot used by another offset? */
				1112	if (msrpm_offsets[i] != MSR_INVALID)
				1113	continue;
				1114
				1115	/* Add offset to list */
				1116	msrpm_offsets[i] = offset;
				1117
				1118	return;
				1119	}
				1120
				1121	/*
				1122	* If this BUG triggers the msrpm_offsets table has an overflow. Just
				1123	* increase MSRPM_OFFSETS in this case.
				1124	*/
				1125	BUG();
				1126	}
				1127
				1128	static void init_msrpm_offsets(void)
				1129	{
				1130	int i;
				1131
				1132	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
				1133
				1134	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
				1135	u32 offset;
				1136
				1137	offset = svm_msrpm_offset(direct_access_msrs[i].index);
				1138	BUG_ON(offset == MSR_INVALID);
				1139
				1140	add_msr_offset(offset);
				1141	}
				1142	}
				1143
				1144	static void svm_enable_lbrv(struct vcpu_svm *svm)
				1145	{
				1146	u32 *msrpm = svm->msrpm;
				1147
				1148	svm->vmcb->control.virt_ext \|= LBR_CTL_ENABLE_MASK;
				1149	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
				1150	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
				1151	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
				1152	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
				1153	}
				1154
				1155	static void svm_disable_lbrv(struct vcpu_svm *svm)
				1156	{
				1157	u32 *msrpm = svm->msrpm;
				1158
				1159	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
				1160	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
				1161	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
				1162	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
				1163	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
				1164	}
				1165
				1166	static void disable_nmi_singlestep(struct vcpu_svm *svm)
				1167	{
				1168	svm->nmi_singlestep = false;
				1169
				1170	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
				1171	/* Clear our flags if they were not set by the guest */
				1172	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
				1173	svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
				1174	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
				1175	svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
				1176	}
				1177	}
				1178
				1179	/* Note:
				1180	* This hash table is used to map VM_ID to a struct kvm_svm,
				1181	* when handling AMD IOMMU GALOG notification to schedule in
				1182	* a particular vCPU.
				1183	*/
				1184	#define SVM_VM_DATA_HASH_BITS 8
				1185	static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
				1186	static u32 next_vm_id = 0;
				1187	static bool next_vm_id_wrapped = 0;
				1188	static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
				1189
				1190	/* Note:
				1191	* This function is called from IOMMU driver to notify
				1192	* SVM to schedule in a particular vCPU of a particular VM.
				1193	*/
				1194	static int avic_ga_log_notifier(u32 ga_tag)
				1195	{
				1196	unsigned long flags;
				1197	struct kvm_svm *kvm_svm;
				1198	struct kvm_vcpu *vcpu = NULL;
				1199	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
				1200	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
				1201
				1202	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
				1203
				1204	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
				1205	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
				1206	if (kvm_svm->avic_vm_id != vm_id)
				1207	continue;
				1208	vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
				1209	break;
				1210	}
				1211	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
				1212
				1213	/* Note:
				1214	* At this point, the IOMMU should have already set the pending
				1215	* bit in the vAPIC backing page. So, we just need to schedule
				1216	* in the vcpu.
				1217	*/
				1218	if (vcpu)
				1219	kvm_vcpu_wake_up(vcpu);
				1220
				1221	return 0;
				1222	}
				1223
				1224	static __init int sev_hardware_setup(void)
				1225	{
				1226	struct sev_user_data_status *status;
				1227	int rc;
				1228
				1229	/* Maximum number of encrypted guests supported simultaneously */
				1230	max_sev_asid = cpuid_ecx(0x8000001F);
				1231
				1232	if (!max_sev_asid)
				1233	return 1;
				1234
				1235	/* Minimum ASID value that should be used for SEV guest */
				1236	min_sev_asid = cpuid_edx(0x8000001F);
				1237
				1238	/* Initialize SEV ASID bitmap */
				1239	sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
				1240	if (!sev_asid_bitmap)
				1241	return 1;
				1242
				1243	status = kmalloc(sizeof(*status), GFP_KERNEL);
				1244	if (!status)
				1245	return 1;
				1246
				1247	/*
				1248	* Check SEV platform status.
				1249	*
				1250	* PLATFORM_STATUS can be called in any state, if we failed to query
				1251	* the PLATFORM status then either PSP firmware does not support SEV
				1252	* feature or SEV firmware is dead.
				1253	*/
				1254	rc = sev_platform_status(status, NULL);
				1255	if (rc)
				1256	goto err;
				1257
				1258	pr_info("SEV supported\n");
				1259
				1260	err:
				1261	kfree(status);
				1262	return rc;
				1263	}
				1264
				1265	static void grow_ple_window(struct kvm_vcpu *vcpu)
				1266	{
				1267	struct vcpu_svm *svm = to_svm(vcpu);
				1268	struct vmcb_control_area *control = &svm->vmcb->control;
				1269	int old = control->pause_filter_count;
				1270
				1271	control->pause_filter_count = __grow_ple_window(old,
				1272	pause_filter_count,
				1273	pause_filter_count_grow,
				1274	pause_filter_count_max);
				1275
				1276	if (control->pause_filter_count != old)
				1277	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
				1278
				1279	trace_kvm_ple_window_grow(vcpu->vcpu_id,
				1280	control->pause_filter_count, old);
				1281	}
				1282
				1283	static void shrink_ple_window(struct kvm_vcpu *vcpu)
				1284	{
				1285	struct vcpu_svm *svm = to_svm(vcpu);
				1286	struct vmcb_control_area *control = &svm->vmcb->control;
				1287	int old = control->pause_filter_count;
				1288
				1289	control->pause_filter_count =
				1290	__shrink_ple_window(old,
				1291	pause_filter_count,
				1292	pause_filter_count_shrink,
				1293	pause_filter_count);
				1294	if (control->pause_filter_count != old)
				1295	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
				1296
				1297	trace_kvm_ple_window_shrink(vcpu->vcpu_id,
				1298	control->pause_filter_count, old);
				1299	}
				1300
				1301	static __init int svm_hardware_setup(void)
				1302	{
				1303	int cpu;
				1304	struct page *iopm_pages;
				1305	void *iopm_va;
				1306	int r;
				1307
				1308	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
				1309
				1310	if (!iopm_pages)
				1311	return -ENOMEM;
				1312
				1313	iopm_va = page_address(iopm_pages);
				1314	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
				1315	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
				1316
				1317	init_msrpm_offsets();
				1318
				1319	if (boot_cpu_has(X86_FEATURE_NX))
				1320	kvm_enable_efer_bits(EFER_NX);
				1321
				1322	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
				1323	kvm_enable_efer_bits(EFER_FFXSR);
				1324
				1325	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
				1326	kvm_has_tsc_control = true;
				1327	kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
				1328	kvm_tsc_scaling_ratio_frac_bits = 32;
				1329	}
				1330
				1331	/* Check for pause filtering support */
				1332	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
				1333	pause_filter_count = 0;
				1334	pause_filter_thresh = 0;
				1335	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
				1336	pause_filter_thresh = 0;
				1337	}
				1338
				1339	if (nested) {
				1340	printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
				1341	kvm_enable_efer_bits(EFER_SVME \| EFER_LMSLE);
				1342	}
				1343
				1344	if (sev) {
				1345	if (boot_cpu_has(X86_FEATURE_SEV) &&
				1346	IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
				1347	r = sev_hardware_setup();
				1348	if (r)
				1349	sev = false;
				1350	} else {
				1351	sev = false;
				1352	}
				1353	}
				1354
				1355	for_each_possible_cpu(cpu) {
				1356	r = svm_cpu_init(cpu);
				1357	if (r)
				1358	goto err;
				1359	}
				1360
				1361	if (!boot_cpu_has(X86_FEATURE_NPT))
				1362	npt_enabled = false;
				1363
				1364	if (npt_enabled && !npt) {
				1365	printk(KERN_INFO "kvm: Nested Paging disabled\n");
				1366	npt_enabled = false;
				1367	}
				1368
				1369	if (npt_enabled) {
				1370	printk(KERN_INFO "kvm: Nested Paging enabled\n");
				1371	kvm_enable_tdp();
				1372	} else
				1373	kvm_disable_tdp();
				1374
				1375	if (avic) {
				1376	if (!npt_enabled \|\|
				1377	!boot_cpu_has(X86_FEATURE_AVIC) \|\|
				1378	!IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
				1379	avic = false;
				1380	} else {
				1381	pr_info("AVIC enabled\n");
				1382
				1383	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
				1384	}
				1385	}
				1386
				1387	if (vls) {
				1388	if (!npt_enabled \|\|
				1389	!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) \|\|
				1390	!IS_ENABLED(CONFIG_X86_64)) {
				1391	vls = false;
				1392	} else {
				1393	pr_info("Virtual VMLOAD VMSAVE supported\n");
				1394	}
				1395	}
				1396
				1397	if (vgif) {
				1398	if (!boot_cpu_has(X86_FEATURE_VGIF))
				1399	vgif = false;
				1400	else
				1401	pr_info("Virtual GIF supported\n");
				1402	}
				1403
				1404	return 0;
				1405
				1406	err:
				1407	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
				1408	iopm_base = 0;
				1409	return r;
				1410	}
				1411
				1412	static __exit void svm_hardware_unsetup(void)
				1413	{
				1414	int cpu;
				1415
				1416	if (svm_sev_enabled())
				1417	bitmap_free(sev_asid_bitmap);
				1418
				1419	for_each_possible_cpu(cpu)
				1420	svm_cpu_uninit(cpu);
				1421
				1422	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
				1423	iopm_base = 0;
				1424	}
				1425
				1426	static void init_seg(struct vmcb_seg *seg)
				1427	{
				1428	seg->selector = 0;
				1429	seg->attrib = SVM_SELECTOR_P_MASK \| SVM_SELECTOR_S_MASK \|
				1430	SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
				1431	seg->limit = 0xffff;
				1432	seg->base = 0;
				1433	}
				1434
				1435	static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
				1436	{
				1437	seg->selector = 0;
				1438	seg->attrib = SVM_SELECTOR_P_MASK \| type;
				1439	seg->limit = 0xffff;
				1440	seg->base = 0;
				1441	}
				1442
				1443	static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
				1444	{
				1445	struct vcpu_svm *svm = to_svm(vcpu);
				1446
				1447	if (is_guest_mode(vcpu))
				1448	return svm->nested.hsave->control.tsc_offset;
				1449
				1450	return vcpu->arch.tsc_offset;
				1451	}
				1452
				1453	static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
				1454	{
				1455	struct vcpu_svm *svm = to_svm(vcpu);
				1456	u64 g_tsc_offset = 0;
				1457
				1458	if (is_guest_mode(vcpu)) {
				1459	/* Write L1's TSC offset. */
				1460	g_tsc_offset = svm->vmcb->control.tsc_offset -
				1461	svm->nested.hsave->control.tsc_offset;
				1462	svm->nested.hsave->control.tsc_offset = offset;
				1463	} else
				1464	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				1465	svm->vmcb->control.tsc_offset,
				1466	offset);
				1467
				1468	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
				1469
				1470	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
				1471	return svm->vmcb->control.tsc_offset;
				1472	}
				1473
				1474	static void avic_init_vmcb(struct vcpu_svm *svm)
				1475	{
				1476	struct vmcb *vmcb = svm->vmcb;
				1477	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
				1478	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
				1479	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
				1480	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
				1481
				1482	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
				1483	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
				1484	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
				1485	vmcb->control.avic_physical_id \|= AVIC_MAX_PHYSICAL_ID_COUNT;
				1486	vmcb->control.int_ctl \|= AVIC_ENABLE_MASK;
				1487	}
				1488
				1489	static void init_vmcb(struct vcpu_svm *svm)
				1490	{
				1491	struct vmcb_control_area *control = &svm->vmcb->control;
				1492	struct vmcb_save_area *save = &svm->vmcb->save;
				1493
				1494	svm->vcpu.arch.hflags = 0;
				1495
				1496	set_cr_intercept(svm, INTERCEPT_CR0_READ);
				1497	set_cr_intercept(svm, INTERCEPT_CR3_READ);
				1498	set_cr_intercept(svm, INTERCEPT_CR4_READ);
				1499	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
				1500	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
				1501	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
				1502	if (!kvm_vcpu_apicv_active(&svm->vcpu))
				1503	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
				1504
				1505	set_dr_intercepts(svm);
				1506
				1507	set_exception_intercept(svm, PF_VECTOR);
				1508	set_exception_intercept(svm, UD_VECTOR);
				1509	set_exception_intercept(svm, MC_VECTOR);
				1510	set_exception_intercept(svm, AC_VECTOR);
				1511	set_exception_intercept(svm, DB_VECTOR);
				1512	/*
				1513	* Guest access to VMware backdoor ports could legitimately
				1514	* trigger #GP because of TSS I/O permission bitmap.
				1515	* We intercept those #GP and allow access to them anyway
				1516	* as VMware does.
				1517	*/
				1518	if (enable_vmware_backdoor)
				1519	set_exception_intercept(svm, GP_VECTOR);
				1520
				1521	set_intercept(svm, INTERCEPT_INTR);
				1522	set_intercept(svm, INTERCEPT_NMI);
				1523	set_intercept(svm, INTERCEPT_SMI);
				1524	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
				1525	set_intercept(svm, INTERCEPT_RDPMC);
				1526	set_intercept(svm, INTERCEPT_CPUID);
				1527	set_intercept(svm, INTERCEPT_INVD);
				1528	set_intercept(svm, INTERCEPT_INVLPG);
				1529	set_intercept(svm, INTERCEPT_INVLPGA);
				1530	set_intercept(svm, INTERCEPT_IOIO_PROT);
				1531	set_intercept(svm, INTERCEPT_MSR_PROT);
				1532	set_intercept(svm, INTERCEPT_TASK_SWITCH);
				1533	set_intercept(svm, INTERCEPT_SHUTDOWN);
				1534	set_intercept(svm, INTERCEPT_VMRUN);
				1535	set_intercept(svm, INTERCEPT_VMMCALL);
				1536	set_intercept(svm, INTERCEPT_VMLOAD);
				1537	set_intercept(svm, INTERCEPT_VMSAVE);
				1538	set_intercept(svm, INTERCEPT_STGI);
				1539	set_intercept(svm, INTERCEPT_CLGI);
				1540	set_intercept(svm, INTERCEPT_SKINIT);
				1541	set_intercept(svm, INTERCEPT_WBINVD);
				1542	set_intercept(svm, INTERCEPT_XSETBV);
				1543	set_intercept(svm, INTERCEPT_RSM);
				1544
				1545	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
				1546	set_intercept(svm, INTERCEPT_MONITOR);
				1547	set_intercept(svm, INTERCEPT_MWAIT);
				1548	}
				1549
				1550	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
				1551	set_intercept(svm, INTERCEPT_HLT);
				1552
				1553	control->iopm_base_pa = __sme_set(iopm_base);
				1554	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
				1555	control->int_ctl = V_INTR_MASKING_MASK;
				1556
				1557	init_seg(&save->es);
				1558	init_seg(&save->ss);
				1559	init_seg(&save->ds);
				1560	init_seg(&save->fs);
				1561	init_seg(&save->gs);
				1562
				1563	save->cs.selector = 0xf000;
				1564	save->cs.base = 0xffff0000;
				1565	/* Executable/Readable Code Segment */
				1566	save->cs.attrib = SVM_SELECTOR_READ_MASK \| SVM_SELECTOR_P_MASK \|
				1567	SVM_SELECTOR_S_MASK \| SVM_SELECTOR_CODE_MASK;
				1568	save->cs.limit = 0xffff;
				1569
				1570	save->gdtr.limit = 0xffff;
				1571	save->idtr.limit = 0xffff;
				1572
				1573	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
				1574	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
				1575
				1576	svm_set_efer(&svm->vcpu, 0);
				1577	save->dr6 = 0xffff0ff0;
				1578	kvm_set_rflags(&svm->vcpu, 2);
				1579	save->rip = 0x0000fff0;
				1580	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
				1581
				1582	/*
				1583	* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
				1584	* It also updates the guest-visible cr0 value.
				1585	*/
				1586	svm_set_cr0(&svm->vcpu, X86_CR0_NW \| X86_CR0_CD \| X86_CR0_ET);
				1587	kvm_mmu_reset_context(&svm->vcpu);
				1588
				1589	save->cr4 = X86_CR4_PAE;
				1590	/* rdx = ?? */
				1591
				1592	if (npt_enabled) {
				1593	/* Setup VMCB for Nested Paging */
				1594	control->nested_ctl \|= SVM_NESTED_CTL_NP_ENABLE;
				1595	clr_intercept(svm, INTERCEPT_INVLPG);
				1596	clr_exception_intercept(svm, PF_VECTOR);
				1597	clr_cr_intercept(svm, INTERCEPT_CR3_READ);
				1598	clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
				1599	save->g_pat = svm->vcpu.arch.pat;
				1600	save->cr3 = 0;
				1601	save->cr4 = 0;
				1602	}
				1603	svm->asid_generation = 0;
				1604
				1605	svm->nested.vmcb = 0;
				1606	svm->vcpu.arch.hflags = 0;
				1607
				1608	if (pause_filter_count) {
				1609	control->pause_filter_count = pause_filter_count;
				1610	if (pause_filter_thresh)
				1611	control->pause_filter_thresh = pause_filter_thresh;
				1612	set_intercept(svm, INTERCEPT_PAUSE);
				1613	} else {
				1614	clr_intercept(svm, INTERCEPT_PAUSE);
				1615	}
				1616
				1617	if (kvm_vcpu_apicv_active(&svm->vcpu))
				1618	avic_init_vmcb(svm);
				1619
				1620	/*
				1621	* If hardware supports Virtual VMLOAD VMSAVE then enable it
				1622	* in VMCB and clear intercepts to avoid #VMEXIT.
				1623	*/
				1624	if (vls) {
				1625	clr_intercept(svm, INTERCEPT_VMLOAD);
				1626	clr_intercept(svm, INTERCEPT_VMSAVE);
				1627	svm->vmcb->control.virt_ext \|= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
				1628	}
				1629
				1630	if (vgif) {
				1631	clr_intercept(svm, INTERCEPT_STGI);
				1632	clr_intercept(svm, INTERCEPT_CLGI);
				1633	svm->vmcb->control.int_ctl \|= V_GIF_ENABLE_MASK;
				1634	}
				1635
				1636	if (sev_guest(svm->vcpu.kvm)) {
				1637	svm->vmcb->control.nested_ctl \|= SVM_NESTED_CTL_SEV_ENABLE;
				1638	clr_exception_intercept(svm, UD_VECTOR);
				1639	}
				1640
				1641	mark_all_dirty(svm->vmcb);
				1642
				1643	enable_gif(svm);
				1644
				1645	}
				1646
				1647	static u64 avic_get_physical_id_entry(struct kvm_vcpu vcpu,
				1648	unsigned int index)
				1649	{
				1650	u64 *avic_physical_id_table;
				1651	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
				1652
				1653	if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
				1654	return NULL;
				1655
				1656	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
				1657
				1658	return &avic_physical_id_table[index];
				1659	}
				1660
				1661	/**
				1662	* Note:
				1663	* AVIC hardware walks the nested page table to check permissions,
				1664	* but does not use the SPA address specified in the leaf page
				1665	* table entry since it uses address in the AVIC_BACKING_PAGE pointer
				1666	* field of the VMCB. Therefore, we set up the
				1667	* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
				1668	*/
				1669	static int avic_init_access_page(struct kvm_vcpu *vcpu)
				1670	{
				1671	struct kvm *kvm = vcpu->kvm;
				1672	int ret = 0;
				1673
				1674	mutex_lock(&kvm->slots_lock);
				1675	if (kvm->arch.apic_access_page_done)
				1676	goto out;
				1677
				1678	ret = __x86_set_memory_region(kvm,
				1679	APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				1680	APIC_DEFAULT_PHYS_BASE,
				1681	PAGE_SIZE);
				1682	if (ret)
				1683	goto out;
				1684
				1685	kvm->arch.apic_access_page_done = true;
				1686	out:
				1687	mutex_unlock(&kvm->slots_lock);
				1688	return ret;
				1689	}
				1690
				1691	static int avic_init_backing_page(struct kvm_vcpu *vcpu)
				1692	{
				1693	int ret;
				1694	u64 *entry, new_entry;
				1695	int id = vcpu->vcpu_id;
				1696	struct vcpu_svm *svm = to_svm(vcpu);
				1697
				1698	ret = avic_init_access_page(vcpu);
				1699	if (ret)
				1700	return ret;
				1701
				1702	if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
				1703	return -EINVAL;
				1704
				1705	if (!svm->vcpu.arch.apic->regs)
				1706	return -EINVAL;
				1707
				1708	svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
				1709
				1710	/* Setting AVIC backing page address in the phy APIC ID table */
				1711	entry = avic_get_physical_id_entry(vcpu, id);
				1712	if (!entry)
				1713	return -EINVAL;
				1714
				1715	new_entry = READ_ONCE(*entry);
				1716	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
				1717	AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) \|
				1718	AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
				1719	WRITE_ONCE(*entry, new_entry);
				1720
				1721	svm->avic_physical_id_cache = entry;
				1722
				1723	return 0;
				1724	}
				1725
				1726	static void __sev_asid_free(int asid)
				1727	{
				1728	struct svm_cpu_data *sd;
				1729	int cpu, pos;
				1730
				1731	pos = asid - 1;
				1732	clear_bit(pos, sev_asid_bitmap);
				1733
				1734	for_each_possible_cpu(cpu) {
				1735	sd = per_cpu(svm_data, cpu);
				1736	sd->sev_vmcbs[pos] = NULL;
				1737	}
				1738	}
				1739
				1740	static void sev_asid_free(struct kvm *kvm)
				1741	{
				1742	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				1743
				1744	__sev_asid_free(sev->asid);
				1745	}
				1746
				1747	static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
				1748	{
				1749	struct sev_data_decommission *decommission;
				1750	struct sev_data_deactivate *data;
				1751
				1752	if (!handle)
				1753	return;
				1754
				1755	data = kzalloc(sizeof(*data), GFP_KERNEL);
				1756	if (!data)
				1757	return;
				1758
				1759	/* deactivate handle */
				1760	data->handle = handle;
				1761	sev_guest_deactivate(data, NULL);
				1762
				1763	wbinvd_on_all_cpus();
				1764	sev_guest_df_flush(NULL);
				1765	kfree(data);
				1766
				1767	decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
				1768	if (!decommission)
				1769	return;
				1770
				1771	/* decommission handle */
				1772	decommission->handle = handle;
				1773	sev_guest_decommission(decommission, NULL);
				1774
				1775	kfree(decommission);
				1776	}
				1777
				1778	static struct page *sev_pin_memory(struct kvm kvm, unsigned long uaddr,
				1779	unsigned long ulen, unsigned long *n,
				1780	int write)
				1781	{
				1782	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				1783	unsigned long npages, npinned, size;
				1784	unsigned long locked, lock_limit;
				1785	struct page **pages;
				1786	unsigned long first, last;
				1787
				1788	if (ulen == 0 \|\| uaddr + ulen < uaddr)
				1789	return NULL;
				1790
				1791	/* Calculate number of pages. */
				1792	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
				1793	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
				1794	npages = (last - first + 1);
				1795
				1796	locked = sev->pages_locked + npages;
				1797	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
				1798	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
				1799	pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
				1800	return NULL;
				1801	}
				1802
				1803	/* Avoid using vmalloc for smaller buffers. */
				1804	size = npages * sizeof(struct page *);
				1805	if (size > PAGE_SIZE)
				1806	pages = vmalloc(size);
				1807	else
				1808	pages = kmalloc(size, GFP_KERNEL);
				1809
				1810	if (!pages)
				1811	return NULL;
				1812
				1813	/* Pin the user virtual address. */
				1814	npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
				1815	if (npinned != npages) {
				1816	pr_err("SEV: Failure locking %lu pages.\n", npages);
				1817	goto err;
				1818	}
				1819
				1820	*n = npages;
				1821	sev->pages_locked = locked;
				1822
				1823	return pages;
				1824
				1825	err:
				1826	if (npinned > 0)
				1827	release_pages(pages, npinned);
				1828
				1829	kvfree(pages);
				1830	return NULL;
				1831	}
				1832
				1833	static void sev_unpin_memory(struct kvm kvm, struct page *pages,
				1834	unsigned long npages)
				1835	{
				1836	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				1837
				1838	release_pages(pages, npages);
				1839	kvfree(pages);
				1840	sev->pages_locked -= npages;
				1841	}
				1842
				1843	static void sev_clflush_pages(struct page *pages[], unsigned long npages)
				1844	{
				1845	uint8_t *page_virtual;
				1846	unsigned long i;
				1847
				1848	if (npages == 0 \|\| pages == NULL)
				1849	return;
				1850
				1851	for (i = 0; i < npages; i++) {
				1852	page_virtual = kmap_atomic(pages[i]);
				1853	clflush_cache_range(page_virtual, PAGE_SIZE);
				1854	kunmap_atomic(page_virtual);
				1855	}
				1856	}
				1857
				1858	static void __unregister_enc_region_locked(struct kvm *kvm,
				1859	struct enc_region *region)
				1860	{
				1861	/*
				1862	* The guest may change the memory encryption attribute from C=0 -> C=1
				1863	* or vice versa for this memory range. Lets make sure caches are
				1864	* flushed to ensure that guest data gets written into memory with
				1865	* correct C-bit.
				1866	*/
				1867	sev_clflush_pages(region->pages, region->npages);
				1868
				1869	sev_unpin_memory(kvm, region->pages, region->npages);
				1870	list_del(&region->list);
				1871	kfree(region);
				1872	}
				1873
				1874	static struct kvm *svm_vm_alloc(void)
				1875	{
				1876	struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
				1877	return &kvm_svm->kvm;
				1878	}
				1879
				1880	static void svm_vm_free(struct kvm *kvm)
				1881	{
				1882	vfree(to_kvm_svm(kvm));
				1883	}
				1884
				1885	static void sev_vm_destroy(struct kvm *kvm)
				1886	{
				1887	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				1888	struct list_head *head = &sev->regions_list;
				1889	struct list_head pos, q;
				1890
				1891	if (!sev_guest(kvm))
				1892	return;
				1893
				1894	mutex_lock(&kvm->lock);
				1895
				1896	/*
				1897	* if userspace was terminated before unregistering the memory regions
				1898	* then lets unpin all the registered memory.
				1899	*/
				1900	if (!list_empty(head)) {
				1901	list_for_each_safe(pos, q, head) {
				1902	__unregister_enc_region_locked(kvm,
				1903	list_entry(pos, struct enc_region, list));
				1904	}
				1905	}
				1906
				1907	mutex_unlock(&kvm->lock);
				1908
				1909	sev_unbind_asid(kvm, sev->handle);
				1910	sev_asid_free(kvm);
				1911	}
				1912
				1913	static void avic_vm_destroy(struct kvm *kvm)
				1914	{
				1915	unsigned long flags;
				1916	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
				1917
				1918	if (!avic)
				1919	return;
				1920
				1921	if (kvm_svm->avic_logical_id_table_page)
				1922	__free_page(kvm_svm->avic_logical_id_table_page);
				1923	if (kvm_svm->avic_physical_id_table_page)
				1924	__free_page(kvm_svm->avic_physical_id_table_page);
				1925
				1926	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
				1927	hash_del(&kvm_svm->hnode);
				1928	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
				1929	}
				1930
				1931	static void svm_vm_destroy(struct kvm *kvm)
				1932	{
				1933	avic_vm_destroy(kvm);
				1934	sev_vm_destroy(kvm);
				1935	}
				1936
				1937	static int avic_vm_init(struct kvm *kvm)
				1938	{
				1939	unsigned long flags;
				1940	int err = -ENOMEM;
				1941	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
				1942	struct kvm_svm *k2;
				1943	struct page *p_page;
				1944	struct page *l_page;
				1945	u32 vm_id;
				1946
				1947	if (!avic)
				1948	return 0;
				1949
				1950	/* Allocating physical APIC ID table (4KB) */
				1951	p_page = alloc_page(GFP_KERNEL);
				1952	if (!p_page)
				1953	goto free_avic;
				1954
				1955	kvm_svm->avic_physical_id_table_page = p_page;
				1956	clear_page(page_address(p_page));
				1957
				1958	/* Allocating logical APIC ID table (4KB) */
				1959	l_page = alloc_page(GFP_KERNEL);
				1960	if (!l_page)
				1961	goto free_avic;
				1962
				1963	kvm_svm->avic_logical_id_table_page = l_page;
				1964	clear_page(page_address(l_page));
				1965
				1966	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
				1967	again:
				1968	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
				1969	if (vm_id == 0) { /* id is 1-based, zero is not okay */
				1970	next_vm_id_wrapped = 1;
				1971	goto again;
				1972	}
				1973	/* Is it still in use? Only possible if wrapped at least once */
				1974	if (next_vm_id_wrapped) {
				1975	hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
				1976	if (k2->avic_vm_id == vm_id)
				1977	goto again;
				1978	}
				1979	}
				1980	kvm_svm->avic_vm_id = vm_id;
				1981	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
				1982	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
				1983
				1984	return 0;
				1985
				1986	free_avic:
				1987	avic_vm_destroy(kvm);
				1988	return err;
				1989	}
				1990
				1991	static inline int
				1992	avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
				1993	{
				1994	int ret = 0;
				1995	unsigned long flags;
				1996	struct amd_svm_iommu_ir *ir;
				1997	struct vcpu_svm *svm = to_svm(vcpu);
				1998
				1999	if (!kvm_arch_has_assigned_device(vcpu->kvm))
				2000	return 0;
				2001
				2002	/*
				2003	* Here, we go through the per-vcpu ir_list to update all existing
				2004	* interrupt remapping table entry targeting this vcpu.
				2005	*/
				2006	spin_lock_irqsave(&svm->ir_list_lock, flags);
				2007
				2008	if (list_empty(&svm->ir_list))
				2009	goto out;
				2010
				2011	list_for_each_entry(ir, &svm->ir_list, node) {
				2012	ret = amd_iommu_update_ga(cpu, r, ir->data);
				2013	if (ret)
				2014	break;
				2015	}
				2016	out:
				2017	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
				2018	return ret;
				2019	}
				2020
				2021	static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
				2022	{
				2023	u64 entry;
				2024	/* ID = 0xff (broadcast), ID > 0xff (reserved) */
				2025	int h_physical_id = kvm_cpu_get_apicid(cpu);
				2026	struct vcpu_svm *svm = to_svm(vcpu);
				2027
				2028	if (!kvm_vcpu_apicv_active(vcpu))
				2029	return;
				2030
				2031	/*
				2032	* Since the host physical APIC id is 8 bits,
				2033	* we can support host APIC ID upto 255.
				2034	*/
				2035	if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
				2036	return;
				2037
				2038	entry = READ_ONCE(*(svm->avic_physical_id_cache));
				2039	WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
				2040
				2041	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
				2042	entry \|= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
				2043
				2044	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
				2045	if (svm->avic_is_running)
				2046	entry \|= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
				2047
				2048	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
				2049	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
				2050	svm->avic_is_running);
				2051	}
				2052
				2053	static void avic_vcpu_put(struct kvm_vcpu *vcpu)
				2054	{
				2055	u64 entry;
				2056	struct vcpu_svm *svm = to_svm(vcpu);
				2057
				2058	if (!kvm_vcpu_apicv_active(vcpu))
				2059	return;
				2060
				2061	entry = READ_ONCE(*(svm->avic_physical_id_cache));
				2062	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
				2063	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
				2064
				2065	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
				2066	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
				2067	}
				2068
				2069	/**
				2070	* This function is called during VCPU halt/unhalt.
				2071	*/
				2072	static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
				2073	{
				2074	struct vcpu_svm *svm = to_svm(vcpu);
				2075
				2076	svm->avic_is_running = is_run;
				2077	if (is_run)
				2078	avic_vcpu_load(vcpu, vcpu->cpu);
				2079	else
				2080	avic_vcpu_put(vcpu);
				2081	}
				2082
				2083	static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
				2084	{
				2085	struct vcpu_svm *svm = to_svm(vcpu);
				2086	u32 dummy;
				2087	u32 eax = 1;
				2088
				2089	vcpu->arch.microcode_version = 0x01000065;
				2090	svm->spec_ctrl = 0;
				2091	svm->virt_spec_ctrl = 0;
				2092
				2093	if (!init_event) {
				2094	svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE \|
				2095	MSR_IA32_APICBASE_ENABLE;
				2096	if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
				2097	svm->vcpu.arch.apic_base \|= MSR_IA32_APICBASE_BSP;
				2098	}
				2099	init_vmcb(svm);
				2100
				2101	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
				2102	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
				2103
				2104	if (kvm_vcpu_apicv_active(vcpu) && !init_event)
				2105	avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
				2106	}
				2107
				2108	static int avic_init_vcpu(struct vcpu_svm *svm)
				2109	{
				2110	int ret;
				2111
				2112	if (!kvm_vcpu_apicv_active(&svm->vcpu))
				2113	return 0;
				2114
				2115	ret = avic_init_backing_page(&svm->vcpu);
				2116	if (ret)
				2117	return ret;
				2118
				2119	INIT_LIST_HEAD(&svm->ir_list);
				2120	spin_lock_init(&svm->ir_list_lock);
				2121
				2122	return ret;
				2123	}
				2124
				2125	static struct kvm_vcpu svm_create_vcpu(struct kvm kvm, unsigned int id)
				2126	{
				2127	struct vcpu_svm *svm;
				2128	struct page *page;
				2129	struct page *msrpm_pages;
				2130	struct page *hsave_page;
				2131	struct page *nested_msrpm_pages;
				2132	int err;
				2133
				2134	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
				2135	if (!svm) {
				2136	err = -ENOMEM;
				2137	goto out;
				2138	}
				2139
				2140	err = kvm_vcpu_init(&svm->vcpu, kvm, id);
				2141	if (err)
				2142	goto free_svm;
				2143
				2144	err = -ENOMEM;
				2145	page = alloc_page(GFP_KERNEL);
				2146	if (!page)
				2147	goto uninit;
				2148
				2149	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
				2150	if (!msrpm_pages)
				2151	goto free_page1;
				2152
				2153	nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
				2154	if (!nested_msrpm_pages)
				2155	goto free_page2;
				2156
				2157	hsave_page = alloc_page(GFP_KERNEL);
				2158	if (!hsave_page)
				2159	goto free_page3;
				2160
				2161	err = avic_init_vcpu(svm);
				2162	if (err)
				2163	goto free_page4;
				2164
				2165	/* We initialize this flag to true to make sure that the is_running
				2166	* bit would be set the first time the vcpu is loaded.
				2167	*/
				2168	svm->avic_is_running = true;
				2169
				2170	svm->nested.hsave = page_address(hsave_page);
				2171
				2172	svm->msrpm = page_address(msrpm_pages);
				2173	svm_vcpu_init_msrpm(svm->msrpm);
				2174
				2175	svm->nested.msrpm = page_address(nested_msrpm_pages);
				2176	svm_vcpu_init_msrpm(svm->nested.msrpm);
				2177
				2178	svm->vmcb = page_address(page);
				2179	clear_page(svm->vmcb);
				2180	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
				2181	svm->asid_generation = 0;
				2182	init_vmcb(svm);
				2183
				2184	svm_init_osvw(&svm->vcpu);
				2185
				2186	return &svm->vcpu;
				2187
				2188	free_page4:
				2189	__free_page(hsave_page);
				2190	free_page3:
				2191	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
				2192	free_page2:
				2193	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
				2194	free_page1:
				2195	__free_page(page);
				2196	uninit:
				2197	kvm_vcpu_uninit(&svm->vcpu);
				2198	free_svm:
				2199	kmem_cache_free(kvm_vcpu_cache, svm);
				2200	out:
				2201	return ERR_PTR(err);
				2202	}
				2203
				2204	static void svm_clear_current_vmcb(struct vmcb *vmcb)
				2205	{
				2206	int i;
				2207
				2208	for_each_online_cpu(i)
				2209	cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
				2210	}
				2211
				2212	static void svm_free_vcpu(struct kvm_vcpu *vcpu)
				2213	{
				2214	struct vcpu_svm *svm = to_svm(vcpu);
				2215
				2216	/*
				2217	* The vmcb page can be recycled, causing a false negative in
				2218	* svm_vcpu_load(). So, ensure that no logical CPU has this
				2219	* vmcb page recorded as its current vmcb.
				2220	*/
				2221	svm_clear_current_vmcb(svm->vmcb);
				2222
				2223	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
				2224	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
				2225	__free_page(virt_to_page(svm->nested.hsave));
				2226	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
				2227	kvm_vcpu_uninit(vcpu);
				2228	kmem_cache_free(kvm_vcpu_cache, svm);
				2229	}
				2230
				2231	static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
				2232	{
				2233	struct vcpu_svm *svm = to_svm(vcpu);
				2234	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
				2235	int i;
				2236
				2237	if (unlikely(cpu != vcpu->cpu)) {
				2238	svm->asid_generation = 0;
				2239	mark_all_dirty(svm->vmcb);
				2240	}
				2241
				2242	#ifdef CONFIG_X86_64
				2243	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
				2244	#endif
				2245	savesegment(fs, svm->host.fs);
				2246	savesegment(gs, svm->host.gs);
				2247	svm->host.ldt = kvm_read_ldt();
				2248
				2249	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
				2250	rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
				2251
				2252	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
				2253	u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
				2254	if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
				2255	__this_cpu_write(current_tsc_ratio, tsc_ratio);
				2256	wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
				2257	}
				2258	}
				2259	/* This assumes that the kernel never uses MSR_TSC_AUX */
				2260	if (static_cpu_has(X86_FEATURE_RDTSCP))
				2261	wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
				2262
				2263	if (sd->current_vmcb != svm->vmcb) {
				2264	sd->current_vmcb = svm->vmcb;
				2265	indirect_branch_prediction_barrier();
				2266	}
				2267	avic_vcpu_load(vcpu, cpu);
				2268	}
				2269
				2270	static void svm_vcpu_put(struct kvm_vcpu *vcpu)
				2271	{
				2272	struct vcpu_svm *svm = to_svm(vcpu);
				2273	int i;
				2274
				2275	avic_vcpu_put(vcpu);
				2276
				2277	++vcpu->stat.host_state_reload;
				2278	kvm_load_ldt(svm->host.ldt);
				2279	#ifdef CONFIG_X86_64
				2280	loadsegment(fs, svm->host.fs);
				2281	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
				2282	load_gs_index(svm->host.gs);
				2283	#else
				2284	#ifdef CONFIG_X86_32_LAZY_GS
				2285	loadsegment(gs, svm->host.gs);
				2286	#endif
				2287	#endif
				2288	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
				2289	wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
				2290	}
				2291
				2292	static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
				2293	{
				2294	avic_set_running(vcpu, false);
				2295	}
				2296
				2297	static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
				2298	{
				2299	avic_set_running(vcpu, true);
				2300	}
				2301
				2302	static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
				2303	{
				2304	struct vcpu_svm *svm = to_svm(vcpu);
				2305	unsigned long rflags = svm->vmcb->save.rflags;
				2306
				2307	if (svm->nmi_singlestep) {
				2308	/* Hide our flags if they were not set by the guest */
				2309	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
				2310	rflags &= ~X86_EFLAGS_TF;
				2311	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
				2312	rflags &= ~X86_EFLAGS_RF;
				2313	}
				2314	return rflags;
				2315	}
				2316
				2317	static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
				2318	{
				2319	if (to_svm(vcpu)->nmi_singlestep)
				2320	rflags \|= (X86_EFLAGS_TF \| X86_EFLAGS_RF);
				2321
				2322	/*
				2323	* Any change of EFLAGS.VM is accompanied by a reload of SS
				2324	* (caused by either a task switch or an inter-privilege IRET),
				2325	* so we do not need to update the CPL here.
				2326	*/
				2327	to_svm(vcpu)->vmcb->save.rflags = rflags;
				2328	}
				2329
				2330	static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
				2331	{
				2332	switch (reg) {
				2333	case VCPU_EXREG_PDPTR:
				2334	BUG_ON(!npt_enabled);
				2335	load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
				2336	break;
				2337	default:
				2338	BUG();
				2339	}
				2340	}
				2341
				2342	static void svm_set_vintr(struct vcpu_svm *svm)
				2343	{
				2344	set_intercept(svm, INTERCEPT_VINTR);
				2345	}
				2346
				2347	static void svm_clear_vintr(struct vcpu_svm *svm)
				2348	{
				2349	clr_intercept(svm, INTERCEPT_VINTR);
				2350	}
				2351
				2352	static struct vmcb_seg svm_seg(struct kvm_vcpu vcpu, int seg)
				2353	{
				2354	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
				2355
				2356	switch (seg) {
				2357	case VCPU_SREG_CS: return &save->cs;
				2358	case VCPU_SREG_DS: return &save->ds;
				2359	case VCPU_SREG_ES: return &save->es;
				2360	case VCPU_SREG_FS: return &save->fs;
				2361	case VCPU_SREG_GS: return &save->gs;
				2362	case VCPU_SREG_SS: return &save->ss;
				2363	case VCPU_SREG_TR: return &save->tr;
				2364	case VCPU_SREG_LDTR: return &save->ldtr;
				2365	}
				2366	BUG();
				2367	return NULL;
				2368	}
				2369
				2370	static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
				2371	{
				2372	struct vmcb_seg *s = svm_seg(vcpu, seg);
				2373
				2374	return s->base;
				2375	}
				2376
				2377	static void svm_get_segment(struct kvm_vcpu *vcpu,
				2378	struct kvm_segment *var, int seg)
				2379	{
				2380	struct vmcb_seg *s = svm_seg(vcpu, seg);
				2381
				2382	var->base = s->base;
				2383	var->limit = s->limit;
				2384	var->selector = s->selector;
				2385	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
				2386	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
				2387	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
				2388	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
				2389	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
				2390	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
				2391	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
				2392
				2393	/*
				2394	* AMD CPUs circa 2014 track the G bit for all segments except CS.
				2395	* However, the SVM spec states that the G bit is not observed by the
				2396	* CPU, and some VMware virtual CPUs drop the G bit for all segments.
				2397	* So let's synthesize a legal G bit for all segments, this helps
				2398	* running KVM nested. It also helps cross-vendor migration, because
				2399	* Intel's vmentry has a check on the 'G' bit.
				2400	*/
				2401	var->g = s->limit > 0xfffff;
				2402
				2403	/*
				2404	* AMD's VMCB does not have an explicit unusable field, so emulate it
				2405	* for cross vendor migration purposes by "not present"
				2406	*/
				2407	var->unusable = !var->present;
				2408
				2409	switch (seg) {
				2410	case VCPU_SREG_TR:
				2411	/*
				2412	* Work around a bug where the busy flag in the tr selector
				2413	* isn't exposed
				2414	*/
				2415	var->type \|= 0x2;
				2416	break;
				2417	case VCPU_SREG_DS:
				2418	case VCPU_SREG_ES:
				2419	case VCPU_SREG_FS:
				2420	case VCPU_SREG_GS:
				2421	/*
				2422	* The accessed bit must always be set in the segment
				2423	* descriptor cache, although it can be cleared in the
				2424	* descriptor, the cached bit always remains at 1. Since
				2425	* Intel has a check on this, set it here to support
				2426	* cross-vendor migration.
				2427	*/
				2428	if (!var->unusable)
				2429	var->type \|= 0x1;
				2430	break;
				2431	case VCPU_SREG_SS:
				2432	/*
				2433	* On AMD CPUs sometimes the DB bit in the segment
				2434	* descriptor is left as 1, although the whole segment has
				2435	* been made unusable. Clear it here to pass an Intel VMX
				2436	* entry check when cross vendor migrating.
				2437	*/
				2438	if (var->unusable)
				2439	var->db = 0;
				2440	/* This is symmetric with svm_set_segment() */
				2441	var->dpl = to_svm(vcpu)->vmcb->save.cpl;
				2442	break;
				2443	}
				2444	}
				2445
				2446	static int svm_get_cpl(struct kvm_vcpu *vcpu)
				2447	{
				2448	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
				2449
				2450	return save->cpl;
				2451	}
				2452
				2453	static void svm_get_idt(struct kvm_vcpu vcpu, struct desc_ptr dt)
				2454	{
				2455	struct vcpu_svm *svm = to_svm(vcpu);
				2456
				2457	dt->size = svm->vmcb->save.idtr.limit;
				2458	dt->address = svm->vmcb->save.idtr.base;
				2459	}
				2460
				2461	static void svm_set_idt(struct kvm_vcpu vcpu, struct desc_ptr dt)
				2462	{
				2463	struct vcpu_svm *svm = to_svm(vcpu);
				2464
				2465	svm->vmcb->save.idtr.limit = dt->size;
				2466	svm->vmcb->save.idtr.base = dt->address ;
				2467	mark_dirty(svm->vmcb, VMCB_DT);
				2468	}
				2469
				2470	static void svm_get_gdt(struct kvm_vcpu vcpu, struct desc_ptr dt)
				2471	{
				2472	struct vcpu_svm *svm = to_svm(vcpu);
				2473
				2474	dt->size = svm->vmcb->save.gdtr.limit;
				2475	dt->address = svm->vmcb->save.gdtr.base;
				2476	}
				2477
				2478	static void svm_set_gdt(struct kvm_vcpu vcpu, struct desc_ptr dt)
				2479	{
				2480	struct vcpu_svm *svm = to_svm(vcpu);
				2481
				2482	svm->vmcb->save.gdtr.limit = dt->size;
				2483	svm->vmcb->save.gdtr.base = dt->address ;
				2484	mark_dirty(svm->vmcb, VMCB_DT);
				2485	}
				2486
				/* Intentionally empty: nothing is done for CR0 guest bits here. */
				static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
				{
				}
				2490
				/* Intentionally empty: nothing is done for CR3 here. */
				static void svm_decache_cr3(struct kvm_vcpu *vcpu)
				{
				}
				2494
				/* Intentionally empty: nothing is done for CR4 guest bits here. */
				static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
				{
				}
				2498
				2499	static void update_cr0_intercept(struct vcpu_svm *svm)
				2500	{
				2501	ulong gcr0 = svm->vcpu.arch.cr0;
				2502	u64 *hcr0 = &svm->vmcb->save.cr0;
				2503
				2504	hcr0 = (hcr0 & ~SVM_CR0_SELECTIVE_MASK)
				2505	\| (gcr0 & SVM_CR0_SELECTIVE_MASK);
				2506
				2507	mark_dirty(svm->vmcb, VMCB_CR);
				2508
				2509	if (gcr0 == *hcr0) {
				2510	clr_cr_intercept(svm, INTERCEPT_CR0_READ);
				2511	clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
				2512	} else {
				2513	set_cr_intercept(svm, INTERCEPT_CR0_READ);
				2514	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
				2515	}
				2516	}
				2517
				2518	static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
				2519	{
				2520	struct vcpu_svm *svm = to_svm(vcpu);
				2521
				2522	#ifdef CONFIG_X86_64
				2523	if (vcpu->arch.efer & EFER_LME) {
				2524	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
				2525	vcpu->arch.efer \|= EFER_LMA;
				2526	svm->vmcb->save.efer \|= EFER_LMA \| EFER_LME;
				2527	}
				2528
				2529	if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
				2530	vcpu->arch.efer &= ~EFER_LMA;
				2531	svm->vmcb->save.efer &= ~(EFER_LMA \| EFER_LME);
				2532	}
				2533	}
				2534	#endif
				2535	vcpu->arch.cr0 = cr0;
				2536
				2537	if (!npt_enabled)
				2538	cr0 \|= X86_CR0_PG \| X86_CR0_WP;
				2539
				2540	/*
				2541	* re-enable caching here because the QEMU bios
				2542	* does not do it - this results in some delay at
				2543	* reboot
				2544	*/
				2545	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
				2546	cr0 &= ~(X86_CR0_CD \| X86_CR0_NW);
				2547	svm->vmcb->save.cr0 = cr0;
				2548	mark_dirty(svm->vmcb, VMCB_CR);
				2549	update_cr0_intercept(svm);
				2550	}
				2551
				2552	static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
				2553	{
				2554	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
				2555	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
				2556
				2557	if (cr4 & X86_CR4_VMXE)
				2558	return 1;
				2559
				2560	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
				2561	svm_flush_tlb(vcpu, true);
				2562
				2563	vcpu->arch.cr4 = cr4;
				2564	if (!npt_enabled)
				2565	cr4 \|= X86_CR4_PAE;
				2566	cr4 \|= host_cr4_mce;
				2567	to_svm(vcpu)->vmcb->save.cr4 = cr4;
				2568	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
				2569	return 0;
				2570	}
				2571
				2572	static void svm_set_segment(struct kvm_vcpu *vcpu,
				2573	struct kvm_segment *var, int seg)
				2574	{
				2575	struct vcpu_svm *svm = to_svm(vcpu);
				2576	struct vmcb_seg *s = svm_seg(vcpu, seg);
				2577
				2578	s->base = var->base;
				2579	s->limit = var->limit;
				2580	s->selector = var->selector;
				2581	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
				2582	s->attrib \|= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
				2583	s->attrib \|= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
				2584	s->attrib \|= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
				2585	s->attrib \|= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
				2586	s->attrib \|= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
				2587	s->attrib \|= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
				2588	s->attrib \|= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
				2589
				2590	/*
				2591	* This is always accurate, except if SYSRET returned to a segment
				2592	* with SS.DPL != 3. Intel does not have this quirk, and always
				2593	* forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
				2594	* would entail passing the CPL to userspace and back.
				2595	*/
				2596	if (seg == VCPU_SREG_SS)
				2597	/* This is symmetric with svm_get_segment() */
				2598	svm->vmcb->save.cpl = (var->dpl & 3);
				2599
				2600	mark_dirty(svm->vmcb, VMCB_SEG);
				2601	}
				2602
				2603	static void update_bp_intercept(struct kvm_vcpu *vcpu)
				2604	{
				2605	struct vcpu_svm *svm = to_svm(vcpu);
				2606
				2607	clr_exception_intercept(svm, BP_VECTOR);
				2608
				2609	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
				2610	if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
				2611	set_exception_intercept(svm, BP_VECTOR);
				2612	} else
				2613	vcpu->guest_debug = 0;
				2614	}
				2615
				2616	static void new_asid(struct vcpu_svm svm, struct svm_cpu_data sd)
				2617	{
				2618	if (sd->next_asid > sd->max_asid) {
				2619	++sd->asid_generation;
				2620	sd->next_asid = sd->min_asid;
				2621	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
				2622	}
				2623
				2624	svm->asid_generation = sd->asid_generation;
				2625	svm->vmcb->control.asid = sd->next_asid++;
				2626
				2627	mark_dirty(svm->vmcb, VMCB_ASID);
				2628	}
				2629
				2630	static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
				2631	{
				2632	return to_svm(vcpu)->vmcb->save.dr6;
				2633	}
				2634
				2635	static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
				2636	{
				2637	struct vcpu_svm *svm = to_svm(vcpu);
				2638
				2639	svm->vmcb->save.dr6 = value;
				2640	mark_dirty(svm->vmcb, VMCB_DR);
				2641	}
				2642
				2643	static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
				2644	{
				2645	struct vcpu_svm *svm = to_svm(vcpu);
				2646
				2647	get_debugreg(vcpu->arch.db[0], 0);
				2648	get_debugreg(vcpu->arch.db[1], 1);
				2649	get_debugreg(vcpu->arch.db[2], 2);
				2650	get_debugreg(vcpu->arch.db[3], 3);
				2651	vcpu->arch.dr6 = svm_get_dr6(vcpu);
				2652	vcpu->arch.dr7 = svm->vmcb->save.dr7;
				2653
				2654	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
				2655	set_dr_intercepts(svm);
				2656	}
				2657
				2658	static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
				2659	{
				2660	struct vcpu_svm *svm = to_svm(vcpu);
				2661
				2662	svm->vmcb->save.dr7 = value;
				2663	mark_dirty(svm->vmcb, VMCB_DR);
				2664	}
				2665
				2666	static int pf_interception(struct vcpu_svm *svm)
				2667	{
				2668	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
				2669	u64 error_code = svm->vmcb->control.exit_info_1;
				2670
				2671	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
				2672	static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
				2673	svm->vmcb->control.insn_bytes : NULL,
				2674	svm->vmcb->control.insn_len);
				2675	}
				2676
				2677	static int npf_interception(struct vcpu_svm *svm)
				2678	{
				2679	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
				2680	u64 error_code = svm->vmcb->control.exit_info_1;
				2681
				2682	trace_kvm_page_fault(fault_address, error_code);
				2683	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
				2684	static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
				2685	svm->vmcb->control.insn_bytes : NULL,
				2686	svm->vmcb->control.insn_len);
				2687	}
				2688
				2689	static int db_interception(struct vcpu_svm *svm)
				2690	{
				2691	struct kvm_run *kvm_run = svm->vcpu.run;
				2692	struct kvm_vcpu *vcpu = &svm->vcpu;
				2693
				2694	if (!(svm->vcpu.guest_debug &
				2695	(KVM_GUESTDBG_SINGLESTEP \| KVM_GUESTDBG_USE_HW_BP)) &&
				2696	!svm->nmi_singlestep) {
				2697	kvm_queue_exception(&svm->vcpu, DB_VECTOR);
				2698	return 1;
				2699	}
				2700
				2701	if (svm->nmi_singlestep) {
				2702	disable_nmi_singlestep(svm);
				2703	/* Make sure we check for pending NMIs upon entry */
				2704	kvm_make_request(KVM_REQ_EVENT, vcpu);
				2705	}
				2706
				2707	if (svm->vcpu.guest_debug &
				2708	(KVM_GUESTDBG_SINGLESTEP \| KVM_GUESTDBG_USE_HW_BP)) {
				2709	kvm_run->exit_reason = KVM_EXIT_DEBUG;
				2710	kvm_run->debug.arch.pc =
				2711	svm->vmcb->save.cs.base + svm->vmcb->save.rip;
				2712	kvm_run->debug.arch.exception = DB_VECTOR;
				2713	return 0;
				2714	}
				2715
				2716	return 1;
				2717	}
				2718
				2719	static int bp_interception(struct vcpu_svm *svm)
				2720	{
				2721	struct kvm_run *kvm_run = svm->vcpu.run;
				2722
				2723	kvm_run->exit_reason = KVM_EXIT_DEBUG;
				2724	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
				2725	kvm_run->debug.arch.exception = BP_VECTOR;
				2726	return 0;
				2727	}
				2728
				2729	static int ud_interception(struct vcpu_svm *svm)
				2730	{
				2731	return handle_ud(&svm->vcpu);
				2732	}
				2733
				2734	static int ac_interception(struct vcpu_svm *svm)
				2735	{
				2736	kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
				2737	return 1;
				2738	}
				2739
				2740	static int gp_interception(struct vcpu_svm *svm)
				2741	{
				2742	struct kvm_vcpu *vcpu = &svm->vcpu;
				2743	u32 error_code = svm->vmcb->control.exit_info_1;
				2744	int er;
				2745
				2746	WARN_ON_ONCE(!enable_vmware_backdoor);
				2747
				2748	er = kvm_emulate_instruction(vcpu,
				2749	EMULTYPE_VMWARE \| EMULTYPE_NO_UD_ON_FAIL);
				2750	if (er == EMULATE_USER_EXIT)
				2751	return 0;
				2752	else if (er != EMULATE_DONE)
				2753	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
				2754	return 1;
				2755	}
				2756
				2757	static bool is_erratum_383(void)
				2758	{
				2759	int err, i;
				2760	u64 value;
				2761
				2762	if (!erratum_383_found)
				2763	return false;
				2764
				2765	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
				2766	if (err)
				2767	return false;
				2768
				2769	/* Bit 62 may or may not be set for this mce */
				2770	value &= ~(1ULL << 62);
				2771
				2772	if (value != 0xb600000000010015ULL)
				2773	return false;
				2774
				2775	/* Clear MCi_STATUS registers */
				2776	for (i = 0; i < 6; ++i)
				2777	native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
				2778
				2779	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
				2780	if (!err) {
				2781	u32 low, high;
				2782
				2783	value &= ~(1ULL << 2);
				2784	low = lower_32_bits(value);
				2785	high = upper_32_bits(value);
				2786
				2787	native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
				2788	}
				2789
				2790	/* Flush tlb to evict multi-match entries */
				2791	__flush_tlb_all();
				2792
				2793	return true;
				2794	}
				2795
				2796	static void svm_handle_mce(struct vcpu_svm *svm)
				2797	{
				2798	if (is_erratum_383()) {
				2799	/*
				2800	* Erratum 383 triggered. Guest state is corrupt so kill the
				2801	* guest.
				2802	*/
				2803	pr_err("KVM: Guest triggered AMD Erratum 383\n");
				2804
				2805	kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
				2806
				2807	return;
				2808	}
				2809
				2810	/*
				2811	* On an #MC intercept the MCE handler is not called automatically in
				2812	* the host. So do it by hand here.
				2813	*/
				2814	asm volatile (
				2815	"int $0x12\n");
				2816	/* not sure if we ever come back to this point */
				2817
				2818	return;
				2819	}
				2820
				/* #MC exit handler: does no work here and always returns 1. */
				static int mc_interception(struct vcpu_svm *svm)
				{
					const int handled = 1;

					return handled;
				}
				2825
				2826	static int shutdown_interception(struct vcpu_svm *svm)
				2827	{
				2828	struct kvm_run *kvm_run = svm->vcpu.run;
				2829
				2830	/*
				2831	* VMCB is undefined after a SHUTDOWN intercept
				2832	* so reinitialize it.
				2833	*/
				2834	clear_page(svm->vmcb);
				2835	init_vmcb(svm);
				2836
				2837	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
				2838	return 0;
				2839	}
				2840
				2841	static int io_interception(struct vcpu_svm *svm)
				2842	{
				2843	struct kvm_vcpu *vcpu = &svm->vcpu;
				2844	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
				2845	int size, in, string;
				2846	unsigned port;
				2847
				2848	++svm->vcpu.stat.io_exits;
				2849	string = (io_info & SVM_IOIO_STR_MASK) != 0;
				2850	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
				2851	if (string)
				2852	return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
				2853
				2854	port = io_info >> 16;
				2855	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
				2856	svm->next_rip = svm->vmcb->control.exit_info_2;
				2857
				2858	return kvm_fast_pio(&svm->vcpu, size, port, in);
				2859	}
				2860
				/* NMI exit handler: does no work here and always returns 1. */
				static int nmi_interception(struct vcpu_svm *svm)
				{
					const int handled = 1;

					return handled;
				}
				2865
				2866	static int intr_interception(struct vcpu_svm *svm)
				2867	{
				2868	++svm->vcpu.stat.irq_exits;
				2869	return 1;
				2870	}
				2871
				/* Exit handler that does nothing and always returns 1. */
				static int nop_on_interception(struct vcpu_svm *svm)
				{
					const int handled = 1;

					return handled;
				}
				2876
				2877	static int halt_interception(struct vcpu_svm *svm)
				2878	{
				2879	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
				2880	return kvm_emulate_halt(&svm->vcpu);
				2881	}
				2882
				2883	static int vmmcall_interception(struct vcpu_svm *svm)
				2884	{
				2885	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				2886	return kvm_emulate_hypercall(&svm->vcpu);
				2887	}
				2888
				2889	static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
				2890	{
				2891	struct vcpu_svm *svm = to_svm(vcpu);
				2892
				2893	return svm->nested.nested_cr3;
				2894	}
				2895
				2896	static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
				2897	{
				2898	struct vcpu_svm *svm = to_svm(vcpu);
				2899	u64 cr3 = svm->nested.nested_cr3;
				2900	u64 pdpte;
				2901	int ret;
				2902
				2903	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
				2904	offset_in_page(cr3) + index * 8, 8);
				2905	if (ret)
				2906	return 0;
				2907	return pdpte;
				2908	}
				2909
				2910	static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
				2911	unsigned long root)
				2912	{
				2913	struct vcpu_svm *svm = to_svm(vcpu);
				2914
				2915	svm->vmcb->control.nested_cr3 = __sme_set(root);
				2916	mark_dirty(svm->vmcb, VMCB_NPT);
				2917	}
				2918
				2919	static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				2920	struct x86_exception *fault)
				2921	{
				2922	struct vcpu_svm *svm = to_svm(vcpu);
				2923
				2924	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
				2925	/*
				2926	* TODO: track the cause of the nested page fault, and
				2927	* correctly fill in the high bits of exit_info_1.
				2928	*/
				2929	svm->vmcb->control.exit_code = SVM_EXIT_NPF;
				2930	svm->vmcb->control.exit_code_hi = 0;
				2931	svm->vmcb->control.exit_info_1 = (1ULL << 32);
				2932	svm->vmcb->control.exit_info_2 = fault->address;
				2933	}
				2934
				2935	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
				2936	svm->vmcb->control.exit_info_1 \|= fault->error_code;
				2937
				2938	/*
				2939	* The present bit is always zero for page structure faults on real
				2940	* hardware.
				2941	*/
				2942	if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
				2943	svm->vmcb->control.exit_info_1 &= ~1;
				2944
				2945	nested_svm_vmexit(svm);
				2946	}
				2947
				2948	static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
				2949	{
				2950	WARN_ON(mmu_is_nested(vcpu));
				2951	kvm_init_shadow_mmu(vcpu);
				2952	vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
				2953	vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
				2954	vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
				2955	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
				2956	vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
				2957	reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
				2958	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
				2959	}
				2960
				2961	static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
				2962	{
				2963	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
				2964	}
				2965
				2966	static int nested_svm_check_permissions(struct vcpu_svm *svm)
				2967	{
				2968	if (!(svm->vcpu.arch.efer & EFER_SVME) \|\|
				2969	!is_paging(&svm->vcpu)) {
				2970	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
				2971	return 1;
				2972	}
				2973
				2974	if (svm->vmcb->save.cpl) {
				2975	kvm_inject_gp(&svm->vcpu, 0);
				2976	return 1;
				2977	}
				2978
				2979	return 0;
				2980	}
				2981
				2982	static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				2983	bool has_error_code, u32 error_code)
				2984	{
				2985	int vmexit;
				2986
				2987	if (!is_guest_mode(&svm->vcpu))
				2988	return 0;
				2989
				2990	vmexit = nested_svm_intercept(svm);
				2991	if (vmexit != NESTED_EXIT_DONE)
				2992	return 0;
				2993
				2994	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
				2995	svm->vmcb->control.exit_code_hi = 0;
				2996	svm->vmcb->control.exit_info_1 = error_code;
				2997
				2998	/*
				2999	* FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
				3000	* The fix is to add the ancillary datum (CR2 or DR6) to structs
				3001	* kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
				3002	* written only when inject_pending_event runs (DR6 would written here
				3003	* too). This should be conditional on a new capability---if the
				3004	* capability is disabled, kvm_multiple_exception would write the
				3005	* ancillary information to CR2 or DR6, for backwards ABI-compatibility.
				3006	*/
				3007	if (svm->vcpu.arch.exception.nested_apf)
				3008	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
				3009	else
				3010	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
				3011
				3012	svm->nested.exit_required = true;
				3013	return vmexit;
				3014	}
				3015
				3016	/* This function returns true if it is save to enable the irq window */
				3017	static inline bool nested_svm_intr(struct vcpu_svm *svm)
				3018	{
				3019	if (!is_guest_mode(&svm->vcpu))
				3020	return true;
				3021
				3022	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
				3023	return true;
				3024
				3025	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
				3026	return false;
				3027
				3028	/*
				3029	* if vmexit was already requested (by intercepted exception
				3030	* for instance) do not overwrite it with "external interrupt"
				3031	* vmexit.
				3032	*/
				3033	if (svm->nested.exit_required)
				3034	return false;
				3035
				3036	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
				3037	svm->vmcb->control.exit_info_1 = 0;
				3038	svm->vmcb->control.exit_info_2 = 0;
				3039
				3040	if (svm->nested.intercept & 1ULL) {
				3041	/*
				3042	* The #vmexit can't be emulated here directly because this
				3043	* code path runs with irqs and preemption disabled. A
				3044	* #vmexit emulation might sleep. Only signal request for
				3045	* the #vmexit here.
				3046	*/
				3047	svm->nested.exit_required = true;
				3048	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
				3049	return false;
				3050	}
				3051
				3052	return true;
				3053	}
				3054
				3055	/* This function returns true if it is save to enable the nmi window */
				3056	static inline bool nested_svm_nmi(struct vcpu_svm *svm)
				3057	{
				3058	if (!is_guest_mode(&svm->vcpu))
				3059	return true;
				3060
				3061	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
				3062	return true;
				3063
				3064	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
				3065	svm->nested.exit_required = true;
				3066
				3067	return false;
				3068	}
				3069
				3070	static void nested_svm_map(struct vcpu_svm svm, u64 gpa, struct page **_page)
				3071	{
				3072	struct page *page;
				3073
				3074	might_sleep();
				3075
				3076	page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
				3077	if (is_error_page(page))
				3078	goto error;
				3079
				3080	*_page = page;
				3081
				3082	return kmap(page);
				3083
				3084	error:
				3085	kvm_inject_gp(&svm->vcpu, 0);
				3086
				3087	return NULL;
				3088	}
				3089
				/*
				 * Counterpart of nested_svm_map(): drop the kmap() mapping of @page and
				 * release the page as dirty.
				 */
				static void nested_svm_unmap(struct page *page)
				{
					kunmap(page);
					kvm_release_page_dirty(page);
				}
				3095
				3096	static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
				3097	{
				3098	unsigned port, size, iopm_len;
				3099	u16 val, mask;
				3100	u8 start_bit;
				3101	u64 gpa;
				3102
				3103	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
				3104	return NESTED_EXIT_HOST;
				3105
				3106	port = svm->vmcb->control.exit_info_1 >> 16;
				3107	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
				3108	SVM_IOIO_SIZE_SHIFT;
				3109	gpa = svm->nested.vmcb_iopm + (port / 8);
				3110	start_bit = port % 8;
				3111	iopm_len = (start_bit + size > 8) ? 2 : 1;
				3112	mask = (0xf >> (4 - size)) << start_bit;
				3113	val = 0;
				3114
				3115	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
				3116	return NESTED_EXIT_DONE;
				3117
				3118	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
				3119	}
				3120
				3121	static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
				3122	{
				3123	u32 offset, msr, value;
				3124	int write, mask;
				3125
				3126	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
				3127	return NESTED_EXIT_HOST;
				3128
				3129	msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
				3130	offset = svm_msrpm_offset(msr);
				3131	write = svm->vmcb->control.exit_info_1 & 1;
				3132	mask = 1 << ((2 * (msr & 0xf)) + write);
				3133
				3134	if (offset == MSR_INVALID)
				3135	return NESTED_EXIT_DONE;
				3136
				3137	/* Offset is in 32 bit units but need in 8 bit units */
				3138	offset *= 4;
				3139
				3140	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
				3141	return NESTED_EXIT_DONE;
				3142
				3143	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
				3144	}
				3145
				3146	/* DB exceptions for our internal use must not cause vmexit */
				3147	static int nested_svm_intercept_db(struct vcpu_svm *svm)
				3148	{
				3149	unsigned long dr6;
				3150
				3151	/* if we're not singlestepping, it's not ours */
				3152	if (!svm->nmi_singlestep)
				3153	return NESTED_EXIT_DONE;
				3154
				3155	/* if it's not a singlestep exception, it's not ours */
				3156	if (kvm_get_dr(&svm->vcpu, 6, &dr6))
				3157	return NESTED_EXIT_DONE;
				3158	if (!(dr6 & DR6_BS))
				3159	return NESTED_EXIT_DONE;
				3160
				3161	/* if the guest is singlestepping, it should get the vmexit */
				3162	if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
				3163	disable_nmi_singlestep(svm);
				3164	return NESTED_EXIT_DONE;
				3165	}
				3166
				3167	/* it's ours, the nested hypervisor must not see this one */
				3168	return NESTED_EXIT_HOST;
				3169	}
				3170
				3171	static int nested_svm_exit_special(struct vcpu_svm *svm)
				3172	{
				3173	u32 exit_code = svm->vmcb->control.exit_code;
				3174
				3175	switch (exit_code) {
				3176	case SVM_EXIT_INTR:
				3177	case SVM_EXIT_NMI:
				3178	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
				3179	return NESTED_EXIT_HOST;
				3180	case SVM_EXIT_NPF:
				3181	/* For now we are always handling NPFs when using them */
				3182	if (npt_enabled)
				3183	return NESTED_EXIT_HOST;
				3184	break;
				3185	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
				3186	/* When we're shadowing, trap PFs, but not async PF */
				3187	if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
				3188	return NESTED_EXIT_HOST;
				3189	break;
				3190	default:
				3191	break;
				3192	}
				3193
				3194	return NESTED_EXIT_CONTINUE;
				3195	}
				3196
				3197	/*
				3198	* If this function returns true, this #vmexit was already handled
				3199	*/
				3200	static int nested_svm_intercept(struct vcpu_svm *svm)
				3201	{
				3202	u32 exit_code = svm->vmcb->control.exit_code;
				3203	int vmexit = NESTED_EXIT_HOST;
				3204
				3205	switch (exit_code) {
				3206	case SVM_EXIT_MSR:
				3207	vmexit = nested_svm_exit_handled_msr(svm);
				3208	break;
				3209	case SVM_EXIT_IOIO:
				3210	vmexit = nested_svm_intercept_ioio(svm);
				3211	break;
				3212	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
				3213	u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
				3214	if (svm->nested.intercept_cr & bit)
				3215	vmexit = NESTED_EXIT_DONE;
				3216	break;
				3217	}
				3218	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
				3219	u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
				3220	if (svm->nested.intercept_dr & bit)
				3221	vmexit = NESTED_EXIT_DONE;
				3222	break;
				3223	}
				3224	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
				3225	u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
				3226	if (svm->nested.intercept_exceptions & excp_bits) {
				3227	if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
				3228	vmexit = nested_svm_intercept_db(svm);
				3229	else
				3230	vmexit = NESTED_EXIT_DONE;
				3231	}
				3232	/* async page fault always cause vmexit */
				3233	else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
				3234	svm->vcpu.arch.exception.nested_apf != 0)
				3235	vmexit = NESTED_EXIT_DONE;
				3236	break;
				3237	}
				3238	case SVM_EXIT_ERR: {
				3239	vmexit = NESTED_EXIT_DONE;
				3240	break;
				3241	}
				3242	default: {
				3243	u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
				3244	if (svm->nested.intercept & exit_bits)
				3245	vmexit = NESTED_EXIT_DONE;
				3246	}
				3247	}
				3248
				3249	return vmexit;
				3250	}
				3251
				3252	static int nested_svm_exit_handled(struct vcpu_svm *svm)
				3253	{
				3254	int vmexit;
				3255
				3256	vmexit = nested_svm_intercept(svm);
				3257
				3258	if (vmexit == NESTED_EXIT_DONE)
				3259	nested_svm_vmexit(svm);
				3260
				3261	return vmexit;
				3262	}
				3263
				3264	static inline void copy_vmcb_control_area(struct vmcb dst_vmcb, struct vmcb from_vmcb)
				3265	{
				3266	struct vmcb_control_area *dst = &dst_vmcb->control;
				3267	struct vmcb_control_area *from = &from_vmcb->control;
				3268
				3269	dst->intercept_cr = from->intercept_cr;
				3270	dst->intercept_dr = from->intercept_dr;
				3271	dst->intercept_exceptions = from->intercept_exceptions;
				3272	dst->intercept = from->intercept;
				3273	dst->iopm_base_pa = from->iopm_base_pa;
				3274	dst->msrpm_base_pa = from->msrpm_base_pa;
				3275	dst->tsc_offset = from->tsc_offset;
				3276	dst->asid = from->asid;
				3277	dst->tlb_ctl = from->tlb_ctl;
				3278	dst->int_ctl = from->int_ctl;
				3279	dst->int_vector = from->int_vector;
				3280	dst->int_state = from->int_state;
				3281	dst->exit_code = from->exit_code;
				3282	dst->exit_code_hi = from->exit_code_hi;
				3283	dst->exit_info_1 = from->exit_info_1;
				3284	dst->exit_info_2 = from->exit_info_2;
				3285	dst->exit_int_info = from->exit_int_info;
				3286	dst->exit_int_info_err = from->exit_int_info_err;
				3287	dst->nested_ctl = from->nested_ctl;
				3288	dst->event_inj = from->event_inj;
				3289	dst->event_inj_err = from->event_inj_err;
				3290	dst->nested_cr3 = from->nested_cr3;
				3291	dst->virt_ext = from->virt_ext;
				3292	}
				3293
				3294	static int nested_svm_vmexit(struct vcpu_svm *svm)
				3295	{
				3296	struct vmcb *nested_vmcb;
				3297	struct vmcb *hsave = svm->nested.hsave;
				3298	struct vmcb *vmcb = svm->vmcb;
				3299	struct page *page;
				3300
				3301	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
				3302	vmcb->control.exit_info_1,
				3303	vmcb->control.exit_info_2,
				3304	vmcb->control.exit_int_info,
				3305	vmcb->control.exit_int_info_err,
				3306	KVM_ISA_SVM);
				3307
				3308	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
				3309	if (!nested_vmcb)
				3310	return 1;
				3311
				3312	/* Exit Guest-Mode */
				3313	leave_guest_mode(&svm->vcpu);
				3314	svm->nested.vmcb = 0;
				3315
				3316	/* Give the current vmcb to the guest */
				3317	disable_gif(svm);
				3318
				3319	nested_vmcb->save.es = vmcb->save.es;
				3320	nested_vmcb->save.cs = vmcb->save.cs;
				3321	nested_vmcb->save.ss = vmcb->save.ss;
				3322	nested_vmcb->save.ds = vmcb->save.ds;
				3323	nested_vmcb->save.gdtr = vmcb->save.gdtr;
				3324	nested_vmcb->save.idtr = vmcb->save.idtr;
				3325	nested_vmcb->save.efer = svm->vcpu.arch.efer;
				3326	nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
				3327	nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
				3328	nested_vmcb->save.cr2 = vmcb->save.cr2;
				3329	nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
				3330	nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
				3331	nested_vmcb->save.rip = vmcb->save.rip;
				3332	nested_vmcb->save.rsp = vmcb->save.rsp;
				3333	nested_vmcb->save.rax = vmcb->save.rax;
				3334	nested_vmcb->save.dr7 = vmcb->save.dr7;
				3335	nested_vmcb->save.dr6 = vmcb->save.dr6;
				3336	nested_vmcb->save.cpl = vmcb->save.cpl;
				3337
				3338	nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
				3339	nested_vmcb->control.int_vector = vmcb->control.int_vector;
				3340	nested_vmcb->control.int_state = vmcb->control.int_state;
				3341	nested_vmcb->control.exit_code = vmcb->control.exit_code;
				3342	nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
				3343	nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
				3344	nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
				3345	nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
				3346	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
				3347
				3348	if (svm->nrips_enabled)
				3349	nested_vmcb->control.next_rip = vmcb->control.next_rip;
				3350
				3351	/*
				3352	* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
				3353	* to make sure that we do not lose injected events. So check event_inj
				3354	* here and copy it to exit_int_info if it is valid.
				3355	* Exit_int_info and event_inj can't be both valid because the case
				3356	* below only happens on a VMRUN instruction intercept which has
				3357	* no valid exit_int_info set.
				3358	*/
				3359	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
				3360	struct vmcb_control_area *nc = &nested_vmcb->control;
				3361
				3362	nc->exit_int_info = vmcb->control.event_inj;
				3363	nc->exit_int_info_err = vmcb->control.event_inj_err;
				3364	}
				3365
				3366	nested_vmcb->control.tlb_ctl = 0;
				3367	nested_vmcb->control.event_inj = 0;
				3368	nested_vmcb->control.event_inj_err = 0;
				3369
				3370	/* We always set V_INTR_MASKING and remember the old value in hflags */
				3371	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
				3372	nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
				3373
				3374	/* Restore the original control entries */
				3375	copy_vmcb_control_area(vmcb, hsave);
				3376
				3377	svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
				3378	kvm_clear_exception_queue(&svm->vcpu);
				3379	kvm_clear_interrupt_queue(&svm->vcpu);
				3380
				3381	svm->nested.nested_cr3 = 0;
				3382
				3383	/* Restore selected save entries */
				3384	svm->vmcb->save.es = hsave->save.es;
				3385	svm->vmcb->save.cs = hsave->save.cs;
				3386	svm->vmcb->save.ss = hsave->save.ss;
				3387	svm->vmcb->save.ds = hsave->save.ds;
				3388	svm->vmcb->save.gdtr = hsave->save.gdtr;
				3389	svm->vmcb->save.idtr = hsave->save.idtr;
				3390	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
				3391	svm_set_efer(&svm->vcpu, hsave->save.efer);
				3392	svm_set_cr0(&svm->vcpu, hsave->save.cr0 \| X86_CR0_PE);
				3393	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
				3394	if (npt_enabled) {
				3395	svm->vmcb->save.cr3 = hsave->save.cr3;
				3396	svm->vcpu.arch.cr3 = hsave->save.cr3;
				3397	} else {
				3398	(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
				3399	}
				3400	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
				3401	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
				3402	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
				3403	svm->vmcb->save.dr7 = 0;
				3404	svm->vmcb->save.cpl = 0;
				3405	svm->vmcb->control.exit_int_info = 0;
				3406
				3407	mark_all_dirty(svm->vmcb);
				3408
				3409	nested_svm_unmap(page);
				3410
				3411	nested_svm_uninit_mmu_context(&svm->vcpu);
				3412	kvm_mmu_reset_context(&svm->vcpu);
				3413	kvm_mmu_load(&svm->vcpu);
				3414
				3415	/*
				3416	* Drop what we picked up for L2 via svm_complete_interrupts() so it
				3417	* doesn't end up in L1.
				3418	*/
				3419	svm->vcpu.arch.nmi_injected = false;
				3420	kvm_clear_exception_queue(&svm->vcpu);
				3421	kvm_clear_interrupt_queue(&svm->vcpu);
				3422
				3423	return 0;
				3424	}
				3425
				3426	static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
				3427	{
				3428	/*
				3429	* This function merges the msr permission bitmaps of kvm and the
				3430	* nested vmcb. It is optimized in that it only merges the parts where
				3431	* the kvm msr permission bitmap may contain zero bits
				3432	*/
				3433	int i;
				3434
				3435	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
				3436	return true;
				3437
				3438	for (i = 0; i < MSRPM_OFFSETS; i++) {
				3439	u32 value, p;
				3440	u64 offset;
				3441
				3442	if (msrpm_offsets[i] == 0xffffffff)
				3443	break;
				3444
				3445	p = msrpm_offsets[i];
				3446	offset = svm->nested.vmcb_msrpm + (p * 4);
				3447
				3448	if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
				3449	return false;
				3450
				3451	svm->nested.msrpm[p] = svm->msrpm[p] \| value;
				3452	}
				3453
				3454	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
				3455
				3456	return true;
				3457	}
				3458
				3459	static bool nested_vmcb_checks(struct vmcb *vmcb)
				3460	{
				3461	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
				3462	return false;
				3463
				3464	if (vmcb->control.asid == 0)
				3465	return false;
				3466
				3467	if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
				3468	!npt_enabled)
				3469	return false;
				3470
				3471	return true;
				3472	}
				3473
				3474	static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
				3475	struct vmcb nested_vmcb, struct page page)
				3476	{
				3477	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
				3478	svm->vcpu.arch.hflags \|= HF_HIF_MASK;
				3479	else
				3480	svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
				3481
				3482	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
				3483	kvm_mmu_unload(&svm->vcpu);
				3484	svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
				3485	nested_svm_init_mmu_context(&svm->vcpu);
				3486	}
				3487
				3488	/* Load the nested guest state */
				3489	svm->vmcb->save.es = nested_vmcb->save.es;
				3490	svm->vmcb->save.cs = nested_vmcb->save.cs;
				3491	svm->vmcb->save.ss = nested_vmcb->save.ss;
				3492	svm->vmcb->save.ds = nested_vmcb->save.ds;
				3493	svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
				3494	svm->vmcb->save.idtr = nested_vmcb->save.idtr;
				3495	kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
				3496	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
				3497	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
				3498	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
				3499	if (npt_enabled) {
				3500	svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
				3501	svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
				3502	} else
				3503	(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
				3504
				3505	/* Guest paging mode is active - reset mmu */
				3506	kvm_mmu_reset_context(&svm->vcpu);
				3507
				3508	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
				3509	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
				3510	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
				3511	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
				3512
				3513	/* In case we don't even reach vcpu_run, the fields are not updated */
				3514	svm->vmcb->save.rax = nested_vmcb->save.rax;
				3515	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
				3516	svm->vmcb->save.rip = nested_vmcb->save.rip;
				3517	svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
				3518	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
				3519	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
				3520
				3521	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
				3522	svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
				3523
				3524	/* cache intercepts */
				3525	svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
				3526	svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
				3527	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
				3528	svm->nested.intercept = nested_vmcb->control.intercept;
				3529
				3530	svm_flush_tlb(&svm->vcpu, true);
				3531	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl \| V_INTR_MASKING_MASK;
				3532	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
				3533	svm->vcpu.arch.hflags \|= HF_VINTR_MASK;
				3534	else
				3535	svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
				3536
				3537	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
				3538	/* We only want the cr8 intercept bits of the guest */
				3539	clr_cr_intercept(svm, INTERCEPT_CR8_READ);
				3540	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
				3541	}
				3542
				3543	/* We don't want to see VMMCALLs from a nested guest */
				3544	clr_intercept(svm, INTERCEPT_VMMCALL);
				3545
				3546	svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
				3547	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
				3548
				3549	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
				3550	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
				3551	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
				3552	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
				3553	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
				3554
				3555	nested_svm_unmap(page);
				3556
				3557	/* Enter Guest-Mode */
				3558	enter_guest_mode(&svm->vcpu);
				3559
				3560	/*
				3561	* Merge guest and host intercepts - must be called with vcpu in
				3562	* guest-mode to take affect here
				3563	*/
				3564	recalc_intercepts(svm);
				3565
				3566	svm->nested.vmcb = vmcb_gpa;
				3567
				3568	enable_gif(svm);
				3569
				3570	mark_all_dirty(svm->vmcb);
				3571	}
				3572
				3573	static bool nested_svm_vmrun(struct vcpu_svm *svm)
				3574	{
				3575	struct vmcb *nested_vmcb;
				3576	struct vmcb *hsave = svm->nested.hsave;
				3577	struct vmcb *vmcb = svm->vmcb;
				3578	struct page *page;
				3579	u64 vmcb_gpa;
				3580
				3581	vmcb_gpa = svm->vmcb->save.rax;
				3582
				3583	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
				3584	if (!nested_vmcb)
				3585	return false;
				3586
				3587	if (!nested_vmcb_checks(nested_vmcb)) {
				3588	nested_vmcb->control.exit_code = SVM_EXIT_ERR;
				3589	nested_vmcb->control.exit_code_hi = 0;
				3590	nested_vmcb->control.exit_info_1 = 0;
				3591	nested_vmcb->control.exit_info_2 = 0;
				3592
				3593	nested_svm_unmap(page);
				3594
				3595	return false;
				3596	}
				3597
				3598	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
				3599	nested_vmcb->save.rip,
				3600	nested_vmcb->control.int_ctl,
				3601	nested_vmcb->control.event_inj,
				3602	nested_vmcb->control.nested_ctl);
				3603
				3604	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
				3605	nested_vmcb->control.intercept_cr >> 16,
				3606	nested_vmcb->control.intercept_exceptions,
				3607	nested_vmcb->control.intercept);
				3608
				3609	/* Clear internal status */
				3610	kvm_clear_exception_queue(&svm->vcpu);
				3611	kvm_clear_interrupt_queue(&svm->vcpu);
				3612
				3613	/*
				3614	* Save the old vmcb, so we don't need to pick what we save, but can
				3615	* restore everything when a VMEXIT occurs
				3616	*/
				3617	hsave->save.es = vmcb->save.es;
				3618	hsave->save.cs = vmcb->save.cs;
				3619	hsave->save.ss = vmcb->save.ss;
				3620	hsave->save.ds = vmcb->save.ds;
				3621	hsave->save.gdtr = vmcb->save.gdtr;
				3622	hsave->save.idtr = vmcb->save.idtr;
				3623	hsave->save.efer = svm->vcpu.arch.efer;
				3624	hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
				3625	hsave->save.cr4 = svm->vcpu.arch.cr4;
				3626	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
				3627	hsave->save.rip = kvm_rip_read(&svm->vcpu);
				3628	hsave->save.rsp = vmcb->save.rsp;
				3629	hsave->save.rax = vmcb->save.rax;
				3630	if (npt_enabled)
				3631	hsave->save.cr3 = vmcb->save.cr3;
				3632	else
				3633	hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
				3634
				3635	copy_vmcb_control_area(hsave, vmcb);
				3636
				3637	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
				3638
				3639	return true;
				3640	}
				3641
				3642	static void nested_svm_vmloadsave(struct vmcb from_vmcb, struct vmcb to_vmcb)
				3643	{
				3644	to_vmcb->save.fs = from_vmcb->save.fs;
				3645	to_vmcb->save.gs = from_vmcb->save.gs;
				3646	to_vmcb->save.tr = from_vmcb->save.tr;
				3647	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
				3648	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
				3649	to_vmcb->save.star = from_vmcb->save.star;
				3650	to_vmcb->save.lstar = from_vmcb->save.lstar;
				3651	to_vmcb->save.cstar = from_vmcb->save.cstar;
				3652	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
				3653	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
				3654	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
				3655	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
				3656	}
				3657
				3658	static int vmload_interception(struct vcpu_svm *svm)
				3659	{
				3660	struct vmcb *nested_vmcb;
				3661	struct page *page;
				3662	int ret;
				3663
				3664	if (nested_svm_check_permissions(svm))
				3665	return 1;
				3666
				3667	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
				3668	if (!nested_vmcb)
				3669	return 1;
				3670
				3671	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3672	ret = kvm_skip_emulated_instruction(&svm->vcpu);
				3673
				3674	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
				3675	nested_svm_unmap(page);
				3676
				3677	return ret;
				3678	}
				3679
				3680	static int vmsave_interception(struct vcpu_svm *svm)
				3681	{
				3682	struct vmcb *nested_vmcb;
				3683	struct page *page;
				3684	int ret;
				3685
				3686	if (nested_svm_check_permissions(svm))
				3687	return 1;
				3688
				3689	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
				3690	if (!nested_vmcb)
				3691	return 1;
				3692
				3693	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3694	ret = kvm_skip_emulated_instruction(&svm->vcpu);
				3695
				3696	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
				3697	nested_svm_unmap(page);
				3698
				3699	return ret;
				3700	}
				3701
				3702	static int vmrun_interception(struct vcpu_svm *svm)
				3703	{
				3704	if (nested_svm_check_permissions(svm))
				3705	return 1;
				3706
				3707	/* Save rip after vmrun instruction */
				3708	kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
				3709
				3710	if (!nested_svm_vmrun(svm))
				3711	return 1;
				3712
				3713	if (!nested_svm_vmrun_msrpm(svm))
				3714	goto failed;
				3715
				3716	return 1;
				3717
				3718	failed:
				3719
				3720	svm->vmcb->control.exit_code = SVM_EXIT_ERR;
				3721	svm->vmcb->control.exit_code_hi = 0;
				3722	svm->vmcb->control.exit_info_1 = 0;
				3723	svm->vmcb->control.exit_info_2 = 0;
				3724
				3725	nested_svm_vmexit(svm);
				3726
				3727	return 1;
				3728	}
				3729
				3730	static int stgi_interception(struct vcpu_svm *svm)
				3731	{
				3732	int ret;
				3733
				3734	if (nested_svm_check_permissions(svm))
				3735	return 1;
				3736
				3737	/*
				3738	* If VGIF is enabled, the STGI intercept is only added to
				3739	* detect the opening of the SMI/NMI window; remove it now.
				3740	*/
				3741	if (vgif_enabled(svm))
				3742	clr_intercept(svm, INTERCEPT_STGI);
				3743
				3744	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3745	ret = kvm_skip_emulated_instruction(&svm->vcpu);
				3746	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
				3747
				3748	enable_gif(svm);
				3749
				3750	return ret;
				3751	}
				3752
				3753	static int clgi_interception(struct vcpu_svm *svm)
				3754	{
				3755	int ret;
				3756
				3757	if (nested_svm_check_permissions(svm))
				3758	return 1;
				3759
				3760	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3761	ret = kvm_skip_emulated_instruction(&svm->vcpu);
				3762
				3763	disable_gif(svm);
				3764
				3765	/* After a CLGI no interrupts should come */
				3766	if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
				3767	svm_clear_vintr(svm);
				3768	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
				3769	mark_dirty(svm->vmcb, VMCB_INTR);
				3770	}
				3771
				3772	return ret;
				3773	}
				3774
				3775	static int invlpga_interception(struct vcpu_svm *svm)
				3776	{
				3777	struct kvm_vcpu *vcpu = &svm->vcpu;
				3778
				3779	trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
				3780	kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
				3781
				3782	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
				3783	kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
				3784
				3785	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3786	return kvm_skip_emulated_instruction(&svm->vcpu);
				3787	}
				3788
				3789	static int skinit_interception(struct vcpu_svm *svm)
				3790	{
				3791	trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
				3792
				3793	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
				3794	return 1;
				3795	}
				3796
				3797	static int wbinvd_interception(struct vcpu_svm *svm)
				3798	{
				3799	return kvm_emulate_wbinvd(&svm->vcpu);
				3800	}
				3801
				3802	static int xsetbv_interception(struct vcpu_svm *svm)
				3803	{
				3804	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
				3805	u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
				3806
				3807	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
				3808	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
				3809	return kvm_skip_emulated_instruction(&svm->vcpu);
				3810	}
				3811
				3812	return 1;
				3813	}
				3814
				3815	static int task_switch_interception(struct vcpu_svm *svm)
				3816	{
				3817	u16 tss_selector;
				3818	int reason;
				3819	int int_type = svm->vmcb->control.exit_int_info &
				3820	SVM_EXITINTINFO_TYPE_MASK;
				3821	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
				3822	uint32_t type =
				3823	svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
				3824	uint32_t idt_v =
				3825	svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
				3826	bool has_error_code = false;
				3827	u32 error_code = 0;
				3828
				3829	tss_selector = (u16)svm->vmcb->control.exit_info_1;
				3830
				3831	if (svm->vmcb->control.exit_info_2 &
				3832	(1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
				3833	reason = TASK_SWITCH_IRET;
				3834	else if (svm->vmcb->control.exit_info_2 &
				3835	(1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
				3836	reason = TASK_SWITCH_JMP;
				3837	else if (idt_v)
				3838	reason = TASK_SWITCH_GATE;
				3839	else
				3840	reason = TASK_SWITCH_CALL;
				3841
				3842	if (reason == TASK_SWITCH_GATE) {
				3843	switch (type) {
				3844	case SVM_EXITINTINFO_TYPE_NMI:
				3845	svm->vcpu.arch.nmi_injected = false;
				3846	break;
				3847	case SVM_EXITINTINFO_TYPE_EXEPT:
				3848	if (svm->vmcb->control.exit_info_2 &
				3849	(1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
				3850	has_error_code = true;
				3851	error_code =
				3852	(u32)svm->vmcb->control.exit_info_2;
				3853	}
				3854	kvm_clear_exception_queue(&svm->vcpu);
				3855	break;
				3856	case SVM_EXITINTINFO_TYPE_INTR:
				3857	kvm_clear_interrupt_queue(&svm->vcpu);
				3858	break;
				3859	default:
				3860	break;
				3861	}
				3862	}
				3863
				3864	if (reason != TASK_SWITCH_GATE \|\|
				3865	int_type == SVM_EXITINTINFO_TYPE_SOFT \|\|
				3866	(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
				3867	(int_vec == OF_VECTOR \|\| int_vec == BP_VECTOR)))
				3868	skip_emulated_instruction(&svm->vcpu);
				3869
				3870	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
				3871	int_vec = -1;
				3872
				3873	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
				3874	has_error_code, error_code) == EMULATE_FAIL) {
				3875	svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
				3876	svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
				3877	svm->vcpu.run->internal.ndata = 0;
				3878	return 0;
				3879	}
				3880	return 1;
				3881	}
				3882
				3883	static int cpuid_interception(struct vcpu_svm *svm)
				3884	{
				3885	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
				3886	return kvm_emulate_cpuid(&svm->vcpu);
				3887	}
				3888
				3889	static int iret_interception(struct vcpu_svm *svm)
				3890	{
				3891	++svm->vcpu.stat.nmi_window_exits;
				3892	clr_intercept(svm, INTERCEPT_IRET);
				3893	svm->vcpu.arch.hflags \|= HF_IRET_MASK;
				3894	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
				3895	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
				3896	return 1;
				3897	}
				3898
				3899	static int invlpg_interception(struct vcpu_svm *svm)
				3900	{
				3901	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
				3902	return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
				3903
				3904	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
				3905	return kvm_skip_emulated_instruction(&svm->vcpu);
				3906	}
				3907
				3908	static int emulate_on_interception(struct vcpu_svm *svm)
				3909	{
				3910	return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
				3911	}
				3912
				3913	static int rsm_interception(struct vcpu_svm *svm)
				3914	{
				3915	return kvm_emulate_instruction_from_buffer(&svm->vcpu,
				3916	rsm_ins_bytes, 2) == EMULATE_DONE;
				3917	}
				3918
				3919	static int rdpmc_interception(struct vcpu_svm *svm)
				3920	{
				3921	int err;
				3922
				3923	if (!static_cpu_has(X86_FEATURE_NRIPS))
				3924	return emulate_on_interception(svm);
				3925
				3926	err = kvm_rdpmc(&svm->vcpu);
				3927	return kvm_complete_insn_gp(&svm->vcpu, err);
				3928	}
				3929
				3930	static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
				3931	unsigned long val)
				3932	{
				3933	unsigned long cr0 = svm->vcpu.arch.cr0;
				3934	bool ret = false;
				3935	u64 intercept;
				3936
				3937	intercept = svm->nested.intercept;
				3938
				3939	if (!is_guest_mode(&svm->vcpu) \|\|
				3940	(!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
				3941	return false;
				3942
				3943	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
				3944	val &= ~SVM_CR0_SELECTIVE_MASK;
				3945
				3946	if (cr0 ^ val) {
				3947	svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
				3948	ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
				3949	}
				3950
				3951	return ret;
				3952	}
				3953
				3954	#define CR_VALID (1ULL << 63)
				3955
				3956	static int cr_interception(struct vcpu_svm *svm)
				3957	{
				3958	int reg, cr;
				3959	unsigned long val;
				3960	int err;
				3961
				3962	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
				3963	return emulate_on_interception(svm);
				3964
				3965	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
				3966	return emulate_on_interception(svm);
				3967
				3968	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
				3969	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
				3970	cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
				3971	else
				3972	cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
				3973
				3974	err = 0;
				3975	if (cr >= 16) { /* mov to cr */
				3976	cr -= 16;
				3977	val = kvm_register_read(&svm->vcpu, reg);
				3978	switch (cr) {
				3979	case 0:
				3980	if (!check_selective_cr0_intercepted(svm, val))
				3981	err = kvm_set_cr0(&svm->vcpu, val);
				3982	else
				3983	return 1;
				3984
				3985	break;
				3986	case 3:
				3987	err = kvm_set_cr3(&svm->vcpu, val);
				3988	break;
				3989	case 4:
				3990	err = kvm_set_cr4(&svm->vcpu, val);
				3991	break;
				3992	case 8:
				3993	err = kvm_set_cr8(&svm->vcpu, val);
				3994	break;
				3995	default:
				3996	WARN(1, "unhandled write to CR%d", cr);
				3997	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
				3998	return 1;
				3999	}
				4000	} else { /* mov from cr */
				4001	switch (cr) {
				4002	case 0:
				4003	val = kvm_read_cr0(&svm->vcpu);
				4004	break;
				4005	case 2:
				4006	val = svm->vcpu.arch.cr2;
				4007	break;
				4008	case 3:
				4009	val = kvm_read_cr3(&svm->vcpu);
				4010	break;
				4011	case 4:
				4012	val = kvm_read_cr4(&svm->vcpu);
				4013	break;
				4014	case 8:
				4015	val = kvm_get_cr8(&svm->vcpu);
				4016	break;
				4017	default:
				4018	WARN(1, "unhandled read from CR%d", cr);
				4019	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
				4020	return 1;
				4021	}
				4022	kvm_register_write(&svm->vcpu, reg, val);
				4023	}
				4024	return kvm_complete_insn_gp(&svm->vcpu, err);
				4025	}
				4026
				4027	static int dr_interception(struct vcpu_svm *svm)
				4028	{
				4029	int reg, dr;
				4030	unsigned long val;
				4031
				4032	if (svm->vcpu.guest_debug == 0) {
				4033	/*
				4034	* No more DR vmexits; force a reload of the debug registers
				4035	* and reenter on this instruction. The next vmexit will
				4036	* retrieve the full state of the debug registers.
				4037	*/
				4038	clr_dr_intercepts(svm);
				4039	svm->vcpu.arch.switch_db_regs \|= KVM_DEBUGREG_WONT_EXIT;
				4040	return 1;
				4041	}
				4042
				4043	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
				4044	return emulate_on_interception(svm);
				4045
				4046	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
				4047	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
				4048
				4049	if (dr >= 16) { /* mov to DRn */
				4050	if (!kvm_require_dr(&svm->vcpu, dr - 16))
				4051	return 1;
				4052	val = kvm_register_read(&svm->vcpu, reg);
				4053	kvm_set_dr(&svm->vcpu, dr - 16, val);
				4054	} else {
				4055	if (!kvm_require_dr(&svm->vcpu, dr))
				4056	return 1;
				4057	kvm_get_dr(&svm->vcpu, dr, &val);
				4058	kvm_register_write(&svm->vcpu, reg, val);
				4059	}
				4060
				4061	return kvm_skip_emulated_instruction(&svm->vcpu);
				4062	}
				4063
				4064	static int cr8_write_interception(struct vcpu_svm *svm)
				4065	{
				4066	struct kvm_run *kvm_run = svm->vcpu.run;
				4067	int r;
				4068
				4069	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
				4070	/* instruction emulation calls kvm_set_cr8() */
				4071	r = cr_interception(svm);
				4072	if (lapic_in_kernel(&svm->vcpu))
				4073	return r;
				4074	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
				4075	return r;
				4076	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
				4077	return 0;
				4078	}
				4079
				4080	static int svm_get_msr_feature(struct kvm_msr_entry *msr)
				4081	{
				4082	msr->data = 0;
				4083
				4084	switch (msr->index) {
				4085	case MSR_F10H_DECFG:
				4086	if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
				4087	msr->data \|= MSR_F10H_DECFG_LFENCE_SERIALIZE;
				4088	break;
				4089	default:
				4090	return 1;
				4091	}
				4092
				4093	return 0;
				4094	}
				4095
				4096	static int svm_get_msr(struct kvm_vcpu vcpu, struct msr_data msr_info)
				4097	{
				4098	struct vcpu_svm *svm = to_svm(vcpu);
				4099
				4100	switch (msr_info->index) {
				4101	case MSR_STAR:
				4102	msr_info->data = svm->vmcb->save.star;
				4103	break;
				4104	#ifdef CONFIG_X86_64
				4105	case MSR_LSTAR:
				4106	msr_info->data = svm->vmcb->save.lstar;
				4107	break;
				4108	case MSR_CSTAR:
				4109	msr_info->data = svm->vmcb->save.cstar;
				4110	break;
				4111	case MSR_KERNEL_GS_BASE:
				4112	msr_info->data = svm->vmcb->save.kernel_gs_base;
				4113	break;
				4114	case MSR_SYSCALL_MASK:
				4115	msr_info->data = svm->vmcb->save.sfmask;
				4116	break;
				4117	#endif
				4118	case MSR_IA32_SYSENTER_CS:
				4119	msr_info->data = svm->vmcb->save.sysenter_cs;
				4120	break;
				4121	case MSR_IA32_SYSENTER_EIP:
				4122	msr_info->data = svm->sysenter_eip;
				4123	break;
				4124	case MSR_IA32_SYSENTER_ESP:
				4125	msr_info->data = svm->sysenter_esp;
				4126	break;
				4127	case MSR_TSC_AUX:
				4128	if (!boot_cpu_has(X86_FEATURE_RDTSCP))
				4129	return 1;
				4130	msr_info->data = svm->tsc_aux;
				4131	break;
				4132	/*
				4133	* Nobody will change the following 5 values in the VMCB so we can
				4134	* safely return them on rdmsr. They will always be 0 until LBRV is
				4135	* implemented.
				4136	*/
				4137	case MSR_IA32_DEBUGCTLMSR:
				4138	msr_info->data = svm->vmcb->save.dbgctl;
				4139	break;
				4140	case MSR_IA32_LASTBRANCHFROMIP:
				4141	msr_info->data = svm->vmcb->save.br_from;
				4142	break;
				4143	case MSR_IA32_LASTBRANCHTOIP:
				4144	msr_info->data = svm->vmcb->save.br_to;
				4145	break;
				4146	case MSR_IA32_LASTINTFROMIP:
				4147	msr_info->data = svm->vmcb->save.last_excp_from;
				4148	break;
				4149	case MSR_IA32_LASTINTTOIP:
				4150	msr_info->data = svm->vmcb->save.last_excp_to;
				4151	break;
				4152	case MSR_VM_HSAVE_PA:
				4153	msr_info->data = svm->nested.hsave_msr;
				4154	break;
				4155	case MSR_VM_CR:
				4156	msr_info->data = svm->nested.vm_cr_msr;
				4157	break;
				4158	case MSR_IA32_SPEC_CTRL:
				4159	if (!msr_info->host_initiated &&
				4160	!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
				4161	!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
				4162	return 1;
				4163
				4164	msr_info->data = svm->spec_ctrl;
				4165	break;
				4166	case MSR_AMD64_VIRT_SPEC_CTRL:
				4167	if (!msr_info->host_initiated &&
				4168	!guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
				4169	return 1;
				4170
				4171	msr_info->data = svm->virt_spec_ctrl;
				4172	break;
				4173	case MSR_F15H_IC_CFG: {
				4174
				4175	int family, model;
				4176
				4177	family = guest_cpuid_family(vcpu);
				4178	model = guest_cpuid_model(vcpu);
				4179
				4180	if (family < 0 \|\| model < 0)
				4181	return kvm_get_msr_common(vcpu, msr_info);
				4182
				4183	msr_info->data = 0;
				4184
				4185	if (family == 0x15 &&
				4186	(model >= 0x2 && model < 0x20))
				4187	msr_info->data = 0x1E;
				4188	}
				4189	break;
				4190	case MSR_F10H_DECFG:
				4191	msr_info->data = svm->msr_decfg;
				4192	break;
				4193	default:
				4194	return kvm_get_msr_common(vcpu, msr_info);
				4195	}
				4196	return 0;
				4197	}
				4198
				4199	static int rdmsr_interception(struct vcpu_svm *svm)
				4200	{
				4201	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
				4202	struct msr_data msr_info;
				4203
				4204	msr_info.index = ecx;
				4205	msr_info.host_initiated = false;
				4206	if (svm_get_msr(&svm->vcpu, &msr_info)) {
				4207	trace_kvm_msr_read_ex(ecx);
				4208	kvm_inject_gp(&svm->vcpu, 0);
				4209	return 1;
				4210	} else {
				4211	trace_kvm_msr_read(ecx, msr_info.data);
				4212
				4213	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
				4214	msr_info.data & 0xffffffff);
				4215	kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
				4216	msr_info.data >> 32);
				4217	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
				4218	return kvm_skip_emulated_instruction(&svm->vcpu);
				4219	}
				4220	}
				4221
				4222	static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
				4223	{
				4224	struct vcpu_svm *svm = to_svm(vcpu);
				4225	int svm_dis, chg_mask;
				4226
				4227	if (data & ~SVM_VM_CR_VALID_MASK)
				4228	return 1;
				4229
				4230	chg_mask = SVM_VM_CR_VALID_MASK;
				4231
				4232	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
				4233	chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK \| SVM_VM_CR_SVM_DIS_MASK);
				4234
				4235	svm->nested.vm_cr_msr &= ~chg_mask;
				4236	svm->nested.vm_cr_msr \|= (data & chg_mask);
				4237
				4238	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
				4239
				4240	/* check for svm_disable while efer.svme is set */
				4241	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
				4242	return 1;
				4243
				4244	return 0;
				4245	}
				4246
				4247	static int svm_set_msr(struct kvm_vcpu vcpu, struct msr_data msr)
				4248	{
				4249	struct vcpu_svm *svm = to_svm(vcpu);
				4250
				4251	u32 ecx = msr->index;
				4252	u64 data = msr->data;
				4253	switch (ecx) {
				4254	case MSR_IA32_CR_PAT:
				4255	if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
				4256	return 1;
				4257	vcpu->arch.pat = data;
				4258	svm->vmcb->save.g_pat = data;
				4259	mark_dirty(svm->vmcb, VMCB_NPT);
				4260	break;
				4261	case MSR_IA32_SPEC_CTRL:
				4262	if (!msr->host_initiated &&
				4263	!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
				4264	!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
				4265	return 1;
				4266
				4267	/* The STIBP bit doesn't fault even if it's not advertised */
				4268	if (data & ~(SPEC_CTRL_IBRS \| SPEC_CTRL_STIBP \| SPEC_CTRL_SSBD))
				4269	return 1;
				4270
				4271	svm->spec_ctrl = data;
				4272
				4273	if (!data)
				4274	break;
				4275
				4276	/*
				4277	* For non-nested:
				4278	* When it's written (to non-zero) for the first time, pass
				4279	* it through.
				4280	*
				4281	* For nested:
				4282	* The handling of the MSR bitmap for L2 guests is done in
				4283	* nested_svm_vmrun_msrpm.
				4284	* We update the L1 MSR bit as well since it will end up
				4285	* touching the MSR anyway now.
				4286	*/
				4287	set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
				4288	break;
				4289	case MSR_IA32_PRED_CMD:
				4290	if (!msr->host_initiated &&
				4291	!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
				4292	return 1;
				4293
				4294	if (data & ~PRED_CMD_IBPB)
				4295	return 1;
				4296
				4297	if (!data)
				4298	break;
				4299
				4300	wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
				4301	if (is_guest_mode(vcpu))
				4302	break;
				4303	set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
				4304	break;
				4305	case MSR_AMD64_VIRT_SPEC_CTRL:
				4306	if (!msr->host_initiated &&
				4307	!guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
				4308	return 1;
				4309
				4310	if (data & ~SPEC_CTRL_SSBD)
				4311	return 1;
				4312
				4313	svm->virt_spec_ctrl = data;
				4314	break;
				4315	case MSR_STAR:
				4316	svm->vmcb->save.star = data;
				4317	break;
				4318	#ifdef CONFIG_X86_64
				4319	case MSR_LSTAR:
				4320	svm->vmcb->save.lstar = data;
				4321	break;
				4322	case MSR_CSTAR:
				4323	svm->vmcb->save.cstar = data;
				4324	break;
				4325	case MSR_KERNEL_GS_BASE:
				4326	svm->vmcb->save.kernel_gs_base = data;
				4327	break;
				4328	case MSR_SYSCALL_MASK:
				4329	svm->vmcb->save.sfmask = data;
				4330	break;
				4331	#endif
				4332	case MSR_IA32_SYSENTER_CS:
				4333	svm->vmcb->save.sysenter_cs = data;
				4334	break;
				4335	case MSR_IA32_SYSENTER_EIP:
				4336	svm->sysenter_eip = data;
				4337	svm->vmcb->save.sysenter_eip = data;
				4338	break;
				4339	case MSR_IA32_SYSENTER_ESP:
				4340	svm->sysenter_esp = data;
				4341	svm->vmcb->save.sysenter_esp = data;
				4342	break;
				4343	case MSR_TSC_AUX:
				4344	if (!boot_cpu_has(X86_FEATURE_RDTSCP))
				4345	return 1;
				4346
				4347	/*
				4348	* This is rare, so we update the MSR here instead of using
				4349	* direct_access_msrs. Doing that would require a rdmsr in
				4350	* svm_vcpu_put.
				4351	*/
				4352	svm->tsc_aux = data;
				4353	wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
				4354	break;
				4355	case MSR_IA32_DEBUGCTLMSR:
				4356	if (!boot_cpu_has(X86_FEATURE_LBRV)) {
				4357	vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
				4358	__func__, data);
				4359	break;
				4360	}
				4361	if (data & DEBUGCTL_RESERVED_BITS)
				4362	return 1;
				4363
				4364	svm->vmcb->save.dbgctl = data;
				4365	mark_dirty(svm->vmcb, VMCB_LBR);
				4366	if (data & (1ULL<<0))
				4367	svm_enable_lbrv(svm);
				4368	else
				4369	svm_disable_lbrv(svm);
				4370	break;
				4371	case MSR_VM_HSAVE_PA:
				4372	svm->nested.hsave_msr = data;
				4373	break;
				4374	case MSR_VM_CR:
				4375	return svm_set_vm_cr(vcpu, data);
				4376	case MSR_VM_IGNNE:
				4377	vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
				4378	break;
				4379	case MSR_F10H_DECFG: {
				4380	struct kvm_msr_entry msr_entry;
				4381
				4382	msr_entry.index = msr->index;
				4383	if (svm_get_msr_feature(&msr_entry))
				4384	return 1;
				4385
				4386	/* Check the supported bits */
				4387	if (data & ~msr_entry.data)
				4388	return 1;
				4389
				4390	/* Don't allow the guest to change a bit, #GP */
				4391	if (!msr->host_initiated && (data ^ msr_entry.data))
				4392	return 1;
				4393
				4394	svm->msr_decfg = data;
				4395	break;
				4396	}
				4397	case MSR_IA32_APICBASE:
				4398	if (kvm_vcpu_apicv_active(vcpu))
				4399	avic_update_vapic_bar(to_svm(vcpu), data);
				4400	/* Follow through */
				4401	default:
				4402	return kvm_set_msr_common(vcpu, msr);
				4403	}
				4404	return 0;
				4405	}
				4406
				4407	static int wrmsr_interception(struct vcpu_svm *svm)
				4408	{
				4409	struct msr_data msr;
				4410	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
				4411	u64 data = kvm_read_edx_eax(&svm->vcpu);
				4412
				4413	msr.data = data;
				4414	msr.index = ecx;
				4415	msr.host_initiated = false;
				4416
				4417	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
				4418	if (kvm_set_msr(&svm->vcpu, &msr)) {
				4419	trace_kvm_msr_write_ex(ecx, data);
				4420	kvm_inject_gp(&svm->vcpu, 0);
				4421	return 1;
				4422	} else {
				4423	trace_kvm_msr_write(ecx, data);
				4424	return kvm_skip_emulated_instruction(&svm->vcpu);
				4425	}
				4426	}
				4427
				4428	static int msr_interception(struct vcpu_svm *svm)
				4429	{
				4430	if (svm->vmcb->control.exit_info_1)
				4431	return wrmsr_interception(svm);
				4432	else
				4433	return rdmsr_interception(svm);
				4434	}
				4435
				4436	static int interrupt_window_interception(struct vcpu_svm *svm)
				4437	{
				4438	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
				4439	svm_clear_vintr(svm);
				4440	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
				4441	mark_dirty(svm->vmcb, VMCB_INTR);
				4442	++svm->vcpu.stat.irq_window_exits;
				4443	return 1;
				4444	}
				4445
				4446	static int pause_interception(struct vcpu_svm *svm)
				4447	{
				4448	struct kvm_vcpu *vcpu = &svm->vcpu;
				4449	bool in_kernel = (svm_get_cpl(vcpu) == 0);
				4450
				4451	if (pause_filter_thresh)
				4452	grow_ple_window(vcpu);
				4453
				4454	kvm_vcpu_on_spin(vcpu, in_kernel);
				4455	return 1;
				4456	}
				4457
				4458	static int nop_interception(struct vcpu_svm *svm)
				4459	{
				4460	return kvm_skip_emulated_instruction(&(svm->vcpu));
				4461	}
				4462
				4463	static int monitor_interception(struct vcpu_svm *svm)
				4464	{
				4465	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
				4466	return nop_interception(svm);
				4467	}
				4468
				4469	static int mwait_interception(struct vcpu_svm *svm)
				4470	{
				4471	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
				4472	return nop_interception(svm);
				4473	}
				4474
				/*
				 * Failure causes of an incomplete AVIC IPI.  The values are compared
				 * against the upper 32 bits of exit_info_2 in
				 * avic_incomplete_ipi_interception().
				 */
				enum avic_ipi_failure_cause {
					AVIC_IPI_FAILURE_INVALID_INT_TYPE	= 0,
					AVIC_IPI_FAILURE_TARGET_NOT_RUNNING	= 1,
					AVIC_IPI_FAILURE_INVALID_TARGET		= 2,
					AVIC_IPI_FAILURE_INVALID_BACKING_PAGE	= 3,
				};
				4481
				4482	static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
				4483	{
				4484	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
				4485	u32 icrl = svm->vmcb->control.exit_info_1;
				4486	u32 id = svm->vmcb->control.exit_info_2 >> 32;
				4487	u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
				4488	struct kvm_lapic *apic = svm->vcpu.arch.apic;
				4489
				4490	trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
				4491
				4492	switch (id) {
				4493	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
				4494	/*
				4495	* AVIC hardware handles the generation of
				4496	* IPIs when the specified Message Type is Fixed
				4497	* (also known as fixed delivery mode) and
				4498	* the Trigger Mode is edge-triggered. The hardware
				4499	* also supports self and broadcast delivery modes
				4500	* specified via the Destination Shorthand(DSH)
				4501	* field of the ICRL. Logical and physical APIC ID
				4502	* formats are supported. All other IPI types cause
				4503	* a #VMEXIT, which needs to emulated.
				4504	*/
				4505	kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
				4506	kvm_lapic_reg_write(apic, APIC_ICR, icrl);
				4507	break;
				4508	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
				4509	int i;
				4510	struct kvm_vcpu *vcpu;
				4511	struct kvm *kvm = svm->vcpu.kvm;
				4512	struct kvm_lapic *apic = svm->vcpu.arch.apic;
				4513
				4514	/*
				4515	* At this point, we expect that the AVIC HW has already
				4516	* set the appropriate IRR bits on the valid target
				4517	* vcpus. So, we just need to kick the appropriate vcpu.
				4518	*/
				4519	kvm_for_each_vcpu(i, vcpu, kvm) {
				4520	bool m = kvm_apic_match_dest(vcpu, apic,
				4521	icrl & KVM_APIC_SHORT_MASK,
				4522	GET_APIC_DEST_FIELD(icrh),
				4523	icrl & KVM_APIC_DEST_MASK);
				4524
				4525	if (m && !avic_vcpu_is_running(vcpu))
				4526	kvm_vcpu_wake_up(vcpu);
				4527	}
				4528	break;
				4529	}
				4530	case AVIC_IPI_FAILURE_INVALID_TARGET:
				4531	break;
				4532	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
				4533	WARN_ONCE(1, "Invalid backing page\n");
				4534	break;
				4535	default:
				4536	pr_err("Unknown IPI interception\n");
				4537	}
				4538
				4539	return 1;
				4540	}
				4541
				4542	static u32 avic_get_logical_id_entry(struct kvm_vcpu vcpu, u32 ldr, bool flat)
				4543	{
				4544	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
				4545	int index;
				4546	u32 *logical_apic_id_table;
				4547	int dlid = GET_APIC_LOGICAL_ID(ldr);
				4548
				4549	if (!dlid)
				4550	return NULL;
				4551
				4552	if (flat) { /* flat */
				4553	index = ffs(dlid) - 1;
				4554	if (index > 7)
				4555	return NULL;
				4556	} else { /* cluster */
				4557	int cluster = (dlid & 0xf0) >> 4;
				4558	int apic = ffs(dlid & 0x0f) - 1;
				4559
				4560	if ((apic < 0) \|\| (apic > 7) \|\|
				4561	(cluster >= 0xf))
				4562	return NULL;
				4563	index = (cluster << 2) + apic;
				4564	}
				4565
				4566	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
				4567
				4568	return &logical_apic_id_table[index];
				4569	}
				4570
				4571	static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
				4572	bool valid)
				4573	{
				4574	bool flat;
				4575	u32 *entry, new_entry;
				4576
				4577	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
				4578	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
				4579	if (!entry)
				4580	return -EINVAL;
				4581
				4582	new_entry = READ_ONCE(*entry);
				4583	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
				4584	new_entry \|= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
				4585	if (valid)
				4586	new_entry \|= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
				4587	else
				4588	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
				4589	WRITE_ONCE(*entry, new_entry);
				4590
				4591	return 0;
				4592	}
				4593
				4594	static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
				4595	{
				4596	int ret;
				4597	struct vcpu_svm *svm = to_svm(vcpu);
				4598	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
				4599
				4600	if (!ldr)
				4601	return 1;
				4602
				4603	ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
				4604	if (ret && svm->ldr_reg) {
				4605	avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
				4606	svm->ldr_reg = 0;
				4607	} else {
				4608	svm->ldr_reg = ldr;
				4609	}
				4610	return ret;
				4611	}
				4612
				4613	static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
				4614	{
				4615	u64 old, new;
				4616	struct vcpu_svm *svm = to_svm(vcpu);
				4617	u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
				4618	u32 id = (apic_id_reg >> 24) & 0xff;
				4619
				4620	if (vcpu->vcpu_id == id)
				4621	return 0;
				4622
				4623	old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
				4624	new = avic_get_physical_id_entry(vcpu, id);
				4625	if (!new \|\| !old)
				4626	return 1;
				4627
				4628	/* We need to move physical_id_entry to new offset */
				4629	new = old;
				4630	*old = 0ULL;
				4631	to_svm(vcpu)->avic_physical_id_cache = new;
				4632
				4633	/*
				4634	* Also update the guest physical APIC ID in the logical
				4635	* APIC ID table entry if already setup the LDR.
				4636	*/
				4637	if (svm->ldr_reg)
				4638	avic_handle_ldr_update(vcpu);
				4639
				4640	return 0;
				4641	}
				4642
				4643	static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
				4644	{
				4645	struct vcpu_svm *svm = to_svm(vcpu);
				4646	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
				4647	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
				4648	u32 mod = (dfr >> 28) & 0xf;
				4649
				4650	/*
				4651	* We assume that all local APICs are using the same type.
				4652	* If this changes, we need to flush the AVIC logical
				4653	* APID id table.
				4654	*/
				4655	if (kvm_svm->ldr_mode == mod)
				4656	return 0;
				4657
				4658	clear_page(page_address(kvm_svm->avic_logical_id_table_page));
				4659	kvm_svm->ldr_mode = mod;
				4660
				4661	if (svm->ldr_reg)
				4662	avic_handle_ldr_update(vcpu);
				4663	return 0;
				4664	}
				4665
				4666	static int avic_unaccel_trap_write(struct vcpu_svm *svm)
				4667	{
				4668	struct kvm_lapic *apic = svm->vcpu.arch.apic;
				4669	u32 offset = svm->vmcb->control.exit_info_1 &
				4670	AVIC_UNACCEL_ACCESS_OFFSET_MASK;
				4671
				4672	switch (offset) {
				4673	case APIC_ID:
				4674	if (avic_handle_apic_id_update(&svm->vcpu))
				4675	return 0;
				4676	break;
				4677	case APIC_LDR:
				4678	if (avic_handle_ldr_update(&svm->vcpu))
				4679	return 0;
				4680	break;
				4681	case APIC_DFR:
				4682	avic_handle_dfr_update(&svm->vcpu);
				4683	break;
				4684	default:
				4685	break;
				4686	}
				4687
				4688	kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
				4689
				4690	return 1;
				4691	}
				4692
				4693	static bool is_avic_unaccelerated_access_trap(u32 offset)
				4694	{
				4695	bool ret = false;
				4696
				4697	switch (offset) {
				4698	case APIC_ID:
				4699	case APIC_EOI:
				4700	case APIC_RRR:
				4701	case APIC_LDR:
				4702	case APIC_DFR:
				4703	case APIC_SPIV:
				4704	case APIC_ESR:
				4705	case APIC_ICR:
				4706	case APIC_LVTT:
				4707	case APIC_LVTTHMR:
				4708	case APIC_LVTPC:
				4709	case APIC_LVT0:
				4710	case APIC_LVT1:
				4711	case APIC_LVTERR:
				4712	case APIC_TMICT:
				4713	case APIC_TDCR:
				4714	ret = true;
				4715	break;
				4716	default:
				4717	break;
				4718	}
				4719	return ret;
				4720	}
				4721
				4722	static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
				4723	{
				4724	int ret = 0;
				4725	u32 offset = svm->vmcb->control.exit_info_1 &
				4726	AVIC_UNACCEL_ACCESS_OFFSET_MASK;
				4727	u32 vector = svm->vmcb->control.exit_info_2 &
				4728	AVIC_UNACCEL_ACCESS_VECTOR_MASK;
				4729	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
				4730	AVIC_UNACCEL_ACCESS_WRITE_MASK;
				4731	bool trap = is_avic_unaccelerated_access_trap(offset);
				4732
				4733	trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
				4734	trap, write, vector);
				4735	if (trap) {
				4736	/* Handling Trap */
				4737	WARN_ONCE(!write, "svm: Handling trap read.\n");
				4738	ret = avic_unaccel_trap_write(svm);
				4739	} else {
				4740	/* Handling Fault */
				4741	ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
				4742	}
				4743
				4744	return ret;
				4745	}
				4746
				4747	static int (const svm_exit_handlers[])(struct vcpu_svm svm) = {
				4748	[SVM_EXIT_READ_CR0] = cr_interception,
				4749	[SVM_EXIT_READ_CR3] = cr_interception,
				4750	[SVM_EXIT_READ_CR4] = cr_interception,
				4751	[SVM_EXIT_READ_CR8] = cr_interception,
				4752	[SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
				4753	[SVM_EXIT_WRITE_CR0] = cr_interception,
				4754	[SVM_EXIT_WRITE_CR3] = cr_interception,
				4755	[SVM_EXIT_WRITE_CR4] = cr_interception,
				4756	[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
				4757	[SVM_EXIT_READ_DR0] = dr_interception,
				4758	[SVM_EXIT_READ_DR1] = dr_interception,
				4759	[SVM_EXIT_READ_DR2] = dr_interception,
				4760	[SVM_EXIT_READ_DR3] = dr_interception,
				4761	[SVM_EXIT_READ_DR4] = dr_interception,
				4762	[SVM_EXIT_READ_DR5] = dr_interception,
				4763	[SVM_EXIT_READ_DR6] = dr_interception,
				4764	[SVM_EXIT_READ_DR7] = dr_interception,
				4765	[SVM_EXIT_WRITE_DR0] = dr_interception,
				4766	[SVM_EXIT_WRITE_DR1] = dr_interception,
				4767	[SVM_EXIT_WRITE_DR2] = dr_interception,
				4768	[SVM_EXIT_WRITE_DR3] = dr_interception,
				4769	[SVM_EXIT_WRITE_DR4] = dr_interception,
				4770	[SVM_EXIT_WRITE_DR5] = dr_interception,
				4771	[SVM_EXIT_WRITE_DR6] = dr_interception,
				4772	[SVM_EXIT_WRITE_DR7] = dr_interception,
				4773	[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
				4774	[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
				4775	[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
				4776	[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
				4777	[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
				4778	[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
				4779	[SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
				4780	[SVM_EXIT_INTR] = intr_interception,
				4781	[SVM_EXIT_NMI] = nmi_interception,
				4782	[SVM_EXIT_SMI] = nop_on_interception,
				4783	[SVM_EXIT_INIT] = nop_on_interception,
				4784	[SVM_EXIT_VINTR] = interrupt_window_interception,
				4785	[SVM_EXIT_RDPMC] = rdpmc_interception,
				4786	[SVM_EXIT_CPUID] = cpuid_interception,
				4787	[SVM_EXIT_IRET] = iret_interception,
				4788	[SVM_EXIT_INVD] = emulate_on_interception,
				4789	[SVM_EXIT_PAUSE] = pause_interception,
				4790	[SVM_EXIT_HLT] = halt_interception,
				4791	[SVM_EXIT_INVLPG] = invlpg_interception,
				4792	[SVM_EXIT_INVLPGA] = invlpga_interception,
				4793	[SVM_EXIT_IOIO] = io_interception,
				4794	[SVM_EXIT_MSR] = msr_interception,
				4795	[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
				4796	[SVM_EXIT_SHUTDOWN] = shutdown_interception,
				4797	[SVM_EXIT_VMRUN] = vmrun_interception,
				4798	[SVM_EXIT_VMMCALL] = vmmcall_interception,
				4799	[SVM_EXIT_VMLOAD] = vmload_interception,
				4800	[SVM_EXIT_VMSAVE] = vmsave_interception,
				4801	[SVM_EXIT_STGI] = stgi_interception,
				4802	[SVM_EXIT_CLGI] = clgi_interception,
				4803	[SVM_EXIT_SKINIT] = skinit_interception,
				4804	[SVM_EXIT_WBINVD] = wbinvd_interception,
				4805	[SVM_EXIT_MONITOR] = monitor_interception,
				4806	[SVM_EXIT_MWAIT] = mwait_interception,
				4807	[SVM_EXIT_XSETBV] = xsetbv_interception,
				4808	[SVM_EXIT_NPF] = npf_interception,
				4809	[SVM_EXIT_RSM] = rsm_interception,
				4810	[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
				4811	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
				4812	};
				4813
				4814	static void dump_vmcb(struct kvm_vcpu *vcpu)
				4815	{
				4816	struct vcpu_svm *svm = to_svm(vcpu);
				4817	struct vmcb_control_area *control = &svm->vmcb->control;
				4818	struct vmcb_save_area *save = &svm->vmcb->save;
				4819
				4820	pr_err("VMCB Control Area:\n");
				4821	pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
				4822	pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
				4823	pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
				4824	pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
				4825	pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
				4826	pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
				4827	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
				4828	pr_err("%-20s%d\n", "pause filter threshold:",
				4829	control->pause_filter_thresh);
				4830	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
				4831	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
				4832	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
				4833	pr_err("%-20s%d\n", "asid:", control->asid);
				4834	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
				4835	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
				4836	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
				4837	pr_err("%-20s%08x\n", "int_state:", control->int_state);
				4838	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
				4839	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
				4840	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
				4841	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
				4842	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
				4843	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
				4844	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
				4845	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
				4846	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
				4847	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
				4848	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
				4849	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
				4850	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
				4851	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
				4852	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
				4853	pr_err("VMCB State Save Area:\n");
				4854	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4855	"es:",
				4856	save->es.selector, save->es.attrib,
				4857	save->es.limit, save->es.base);
				4858	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4859	"cs:",
				4860	save->cs.selector, save->cs.attrib,
				4861	save->cs.limit, save->cs.base);
				4862	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4863	"ss:",
				4864	save->ss.selector, save->ss.attrib,
				4865	save->ss.limit, save->ss.base);
				4866	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4867	"ds:",
				4868	save->ds.selector, save->ds.attrib,
				4869	save->ds.limit, save->ds.base);
				4870	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4871	"fs:",
				4872	save->fs.selector, save->fs.attrib,
				4873	save->fs.limit, save->fs.base);
				4874	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4875	"gs:",
				4876	save->gs.selector, save->gs.attrib,
				4877	save->gs.limit, save->gs.base);
				4878	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4879	"gdtr:",
				4880	save->gdtr.selector, save->gdtr.attrib,
				4881	save->gdtr.limit, save->gdtr.base);
				4882	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4883	"ldtr:",
				4884	save->ldtr.selector, save->ldtr.attrib,
				4885	save->ldtr.limit, save->ldtr.base);
				4886	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4887	"idtr:",
				4888	save->idtr.selector, save->idtr.attrib,
				4889	save->idtr.limit, save->idtr.base);
				4890	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
				4891	"tr:",
				4892	save->tr.selector, save->tr.attrib,
				4893	save->tr.limit, save->tr.base);
				4894	pr_err("cpl: %d efer: %016llx\n",
				4895	save->cpl, save->efer);
				4896	pr_err("%-15s %016llx %-13s %016llx\n",
				4897	"cr0:", save->cr0, "cr2:", save->cr2);
				4898	pr_err("%-15s %016llx %-13s %016llx\n",
				4899	"cr3:", save->cr3, "cr4:", save->cr4);
				4900	pr_err("%-15s %016llx %-13s %016llx\n",
				4901	"dr6:", save->dr6, "dr7:", save->dr7);
				4902	pr_err("%-15s %016llx %-13s %016llx\n",
				4903	"rip:", save->rip, "rflags:", save->rflags);
				4904	pr_err("%-15s %016llx %-13s %016llx\n",
				4905	"rsp:", save->rsp, "rax:", save->rax);
				4906	pr_err("%-15s %016llx %-13s %016llx\n",
				4907	"star:", save->star, "lstar:", save->lstar);
				4908	pr_err("%-15s %016llx %-13s %016llx\n",
				4909	"cstar:", save->cstar, "sfmask:", save->sfmask);
				4910	pr_err("%-15s %016llx %-13s %016llx\n",
				4911	"kernel_gs_base:", save->kernel_gs_base,
				4912	"sysenter_cs:", save->sysenter_cs);
				4913	pr_err("%-15s %016llx %-13s %016llx\n",
				4914	"sysenter_esp:", save->sysenter_esp,
				4915	"sysenter_eip:", save->sysenter_eip);
				4916	pr_err("%-15s %016llx %-13s %016llx\n",
				4917	"gpat:", save->g_pat, "dbgctl:", save->dbgctl);
				4918	pr_err("%-15s %016llx %-13s %016llx\n",
				4919	"br_from:", save->br_from, "br_to:", save->br_to);
				4920	pr_err("%-15s %016llx %-13s %016llx\n",
				4921	"excp_from:", save->last_excp_from,
				4922	"excp_to:", save->last_excp_to);
				4923	}
				4924
				4925	static void svm_get_exit_info(struct kvm_vcpu vcpu, u64 info1, u64 *info2)
				4926	{
				4927	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
				4928
				4929	*info1 = control->exit_info_1;
				4930	*info2 = control->exit_info_2;
				4931	}
				4932
				4933	static int handle_exit(struct kvm_vcpu *vcpu)
				4934	{
				4935	struct vcpu_svm *svm = to_svm(vcpu);
				4936	struct kvm_run *kvm_run = vcpu->run;
				4937	u32 exit_code = svm->vmcb->control.exit_code;
				4938
				4939	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
				4940
				4941	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
				4942	vcpu->arch.cr0 = svm->vmcb->save.cr0;
				4943	if (npt_enabled)
				4944	vcpu->arch.cr3 = svm->vmcb->save.cr3;
				4945
				4946	if (unlikely(svm->nested.exit_required)) {
				4947	nested_svm_vmexit(svm);
				4948	svm->nested.exit_required = false;
				4949
				4950	return 1;
				4951	}
				4952
				4953	if (is_guest_mode(vcpu)) {
				4954	int vmexit;
				4955
				4956	trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
				4957	svm->vmcb->control.exit_info_1,
				4958	svm->vmcb->control.exit_info_2,
				4959	svm->vmcb->control.exit_int_info,
				4960	svm->vmcb->control.exit_int_info_err,
				4961	KVM_ISA_SVM);
				4962
				4963	vmexit = nested_svm_exit_special(svm);
				4964
				4965	if (vmexit == NESTED_EXIT_CONTINUE)
				4966	vmexit = nested_svm_exit_handled(svm);
				4967
				4968	if (vmexit == NESTED_EXIT_DONE)
				4969	return 1;
				4970	}
				4971
				4972	svm_complete_interrupts(svm);
				4973
				4974	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
				4975	kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
				4976	kvm_run->fail_entry.hardware_entry_failure_reason
				4977	= svm->vmcb->control.exit_code;
				4978	pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
				4979	dump_vmcb(vcpu);
				4980	return 0;
				4981	}
				4982
				4983	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
				4984	exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
				4985	exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
				4986	exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
				4987	printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
				4988	"exit_code 0x%x\n",
				4989	__func__, svm->vmcb->control.exit_int_info,
				4990	exit_code);
				4991
				4992	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
				4993	\|\| !svm_exit_handlers[exit_code]) {
				4994	WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
				4995	kvm_queue_exception(vcpu, UD_VECTOR);
				4996	return 1;
				4997	}
				4998
				4999	return svm_exit_handlers[exit_code](svm);
				5000	}
				5001
				5002	static void reload_tss(struct kvm_vcpu *vcpu)
				5003	{
				5004	int cpu = raw_smp_processor_id();
				5005
				5006	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
				5007	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
				5008	load_TR_desc();
				5009	}
				5010
				5011	static void pre_sev_run(struct vcpu_svm *svm, int cpu)
				5012	{
				5013	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
				5014	int asid = sev_get_asid(svm->vcpu.kvm);
				5015
				5016	/* Assign the asid allocated with this SEV guest */
				5017	svm->vmcb->control.asid = asid;
				5018
				5019	/*
				5020	* Flush guest TLB:
				5021	*
				5022	* 1) when different VMCB for the same ASID is to be run on the same host CPU.
				5023	* 2) or this VMCB was executed on different host CPU in previous VMRUNs.
				5024	*/
				5025	if (sd->sev_vmcbs[asid] == svm->vmcb &&
				5026	svm->last_cpu == cpu)
				5027	return;
				5028
				5029	svm->last_cpu = cpu;
				5030	sd->sev_vmcbs[asid] = svm->vmcb;
				5031	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
				5032	mark_dirty(svm->vmcb, VMCB_ASID);
				5033	}
				5034
				5035	static void pre_svm_run(struct vcpu_svm *svm)
				5036	{
				5037	int cpu = raw_smp_processor_id();
				5038
				5039	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
				5040
				5041	if (sev_guest(svm->vcpu.kvm))
				5042	return pre_sev_run(svm, cpu);
				5043
				5044	/* FIXME: handle wraparound of asid_generation */
				5045	if (svm->asid_generation != sd->asid_generation)
				5046	new_asid(svm, sd);
				5047	}
				5048
				5049	static void svm_inject_nmi(struct kvm_vcpu *vcpu)
				5050	{
				5051	struct vcpu_svm *svm = to_svm(vcpu);
				5052
				5053	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID \| SVM_EVTINJ_TYPE_NMI;
				5054	vcpu->arch.hflags \|= HF_NMI_MASK;
				5055	set_intercept(svm, INTERCEPT_IRET);
				5056	++vcpu->stat.nmi_injections;
				5057	}
				5058
				5059	static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
				5060	{
				5061	struct vmcb_control_area *control;
				5062
				5063	/* The following fields are ignored when AVIC is enabled */
				5064	control = &svm->vmcb->control;
				5065	control->int_vector = irq;
				5066	control->int_ctl &= ~V_INTR_PRIO_MASK;
				5067	control->int_ctl \|= V_IRQ_MASK \|
				5068	((/control->int_vector >> 4/ 0xf) << V_INTR_PRIO_SHIFT);
				5069	mark_dirty(svm->vmcb, VMCB_INTR);
				5070	}
				5071
				5072	static void svm_set_irq(struct kvm_vcpu *vcpu)
				5073	{
				5074	struct vcpu_svm *svm = to_svm(vcpu);
				5075
				5076	BUG_ON(!(gif_set(svm)));
				5077
				5078	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
				5079	++vcpu->stat.irq_injections;
				5080
				5081	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr \|
				5082	SVM_EVTINJ_VALID \| SVM_EVTINJ_TYPE_INTR;
				5083	}
				5084
				5085	static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
				5086	{
				5087	return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
				5088	}
				5089
				5090	static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
				5091	{
				5092	struct vcpu_svm *svm = to_svm(vcpu);
				5093
				5094	if (svm_nested_virtualize_tpr(vcpu) \|\|
				5095	kvm_vcpu_apicv_active(vcpu))
				5096	return;
				5097
				5098	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
				5099
				5100	if (irr == -1)
				5101	return;
				5102
				5103	if (tpr >= irr)
				5104	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
				5105	}
				5106
				/* No-op in this implementation. */
				static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
				{
				}
				5111
				5112	static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
				5113	{
				5114	return avic && irqchip_split(vcpu->kvm);
				5115	}
				5116
				/* No-op in this implementation; @max_irr is ignored. */
				static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
				{
					/* nothing to do */
				}
				5120
				/* No-op in this implementation; @max_isr is ignored. */
				static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
				{
					/* nothing to do */
				}
				5124
				5125	/* Note: Currently only used by Hyper-V. */
				5126	static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
				5127	{
				5128	struct vcpu_svm *svm = to_svm(vcpu);
				5129	struct vmcb *vmcb = svm->vmcb;
				5130
				5131	if (!kvm_vcpu_apicv_active(&svm->vcpu))
				5132	return;
				5133
				5134	vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
				5135	mark_dirty(vmcb, VMCB_INTR);
				5136	}
				5137
				5138	static void svm_load_eoi_exitmap(struct kvm_vcpu vcpu, u64 eoi_exit_bitmap)
				5139	{
				5140	return;
				5141	}
				5142
				5143	static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
				5144	{
				5145	kvm_lapic_set_irr(vec, vcpu->arch.apic);
				5146	smp_mb__after_atomic();
				5147
				5148	if (avic_vcpu_is_running(vcpu))
				5149	wrmsrl(SVM_AVIC_DOORBELL,
				5150	kvm_cpu_get_apicid(vcpu->cpu));
				5151	else
				5152	kvm_vcpu_wake_up(vcpu);
				5153	}
				5154
				5155	static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
				5156	{
				5157	return false;
				5158	}
				5159
				5160	static void svm_ir_list_del(struct vcpu_svm svm, struct amd_iommu_pi_data pi)
				5161	{
				5162	unsigned long flags;
				5163	struct amd_svm_iommu_ir *cur;
				5164
				5165	spin_lock_irqsave(&svm->ir_list_lock, flags);
				5166	list_for_each_entry(cur, &svm->ir_list, node) {
				5167	if (cur->data != pi->ir_data)
				5168	continue;
				5169	list_del(&cur->node);
				5170	kfree(cur);
				5171	break;
				5172	}
				5173	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
				5174	}
				5175
				5176	static int svm_ir_list_add(struct vcpu_svm svm, struct amd_iommu_pi_data pi)
				5177	{
				5178	int ret = 0;
				5179	unsigned long flags;
				5180	struct amd_svm_iommu_ir *ir;
				5181
				5182	/**
				5183	* In some cases, the existing irte is updaed and re-set,
				5184	* so we need to check here if it's already been * added
				5185	* to the ir_list.
				5186	*/
				5187	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
				5188	struct kvm *kvm = svm->vcpu.kvm;
				5189	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
				5190	struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
				5191	struct vcpu_svm *prev_svm;
				5192
				5193	if (!prev_vcpu) {
				5194	ret = -EINVAL;
				5195	goto out;
				5196	}
				5197
				5198	prev_svm = to_svm(prev_vcpu);
				5199	svm_ir_list_del(prev_svm, pi);
				5200	}
				5201
				5202	/**
				5203	* Allocating new amd_iommu_pi_data, which will get
				5204	* add to the per-vcpu ir_list.
				5205	*/
				5206	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
				5207	if (!ir) {
				5208	ret = -ENOMEM;
				5209	goto out;
				5210	}
				5211	ir->data = pi->ir_data;
				5212
				5213	spin_lock_irqsave(&svm->ir_list_lock, flags);
				5214	list_add(&ir->node, &svm->ir_list);
				5215	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
				5216	out:
				5217	return ret;
				5218	}
				5219
				5220	/**
				5221	* Note:
				5222	* The HW cannot support posting multicast/broadcast
				5223	* interrupts to a vCPU. So, we still use legacy interrupt
				5224	* remapping for these kind of interrupts.
				5225	*
				5226	* For lowest-priority interrupts, we only support
				5227	* those with single CPU as the destination, e.g. user
				5228	* configures the interrupts via /proc/irq or uses
				5229	* irqbalance to make the interrupts single-CPU.
				5230	*/
				5231	static int
				5232	get_pi_vcpu_info(struct kvm kvm, struct kvm_kernel_irq_routing_entry e,
				5233	struct vcpu_data vcpu_info, struct vcpu_svm *svm)
				5234	{
				5235	struct kvm_lapic_irq irq;
				5236	struct kvm_vcpu *vcpu = NULL;
				5237
				5238	kvm_set_msi_irq(kvm, e, &irq);
				5239
				5240	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
				5241	pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
				5242	__func__, irq.vector);
				5243	return -1;
				5244	}
				5245
				5246	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
				5247	irq.vector);
				5248	*svm = to_svm(vcpu);
				5249	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
				5250	vcpu_info->vector = irq.vector;
				5251
				5252	return 0;
				5253	}
				5254
				5255	/*
				5256	* svm_update_pi_irte - set IRTE for Posted-Interrupts
				5257	*
				5258	* @kvm: kvm
				5259	* @host_irq: host irq of the interrupt
				5260	* @guest_irq: gsi of the interrupt
				5261	* @set: set or unset PI
				5262	* returns 0 on success, < 0 on failure
				5263	*/
				5264	static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
				5265	uint32_t guest_irq, bool set)
				5266	{
				5267	struct kvm_kernel_irq_routing_entry *e;
				5268	struct kvm_irq_routing_table *irq_rt;
				5269	int idx, ret = -EINVAL;
				5270
				5271	if (!kvm_arch_has_assigned_device(kvm) \|\|
				5272	!irq_remapping_cap(IRQ_POSTING_CAP))
				5273	return 0;
				5274
				5275	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
				5276	__func__, host_irq, guest_irq, set);
				5277
				5278	idx = srcu_read_lock(&kvm->irq_srcu);
				5279	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
				5280	WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
				5281
				5282	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
				5283	struct vcpu_data vcpu_info;
				5284	struct vcpu_svm *svm = NULL;
				5285
				5286	if (e->type != KVM_IRQ_ROUTING_MSI)
				5287	continue;
				5288
				5289	/**
				5290	* Here, we setup with legacy mode in the following cases:
				5291	* 1. When cannot target interrupt to a specific vcpu.
				5292	* 2. Unsetting posted interrupt.
				5293	* 3. APIC virtialization is disabled for the vcpu.
				5294	*/
				5295	if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
				5296	kvm_vcpu_apicv_active(&svm->vcpu)) {
				5297	struct amd_iommu_pi_data pi;
				5298
				5299	/* Try to enable guest_mode in IRTE */
				5300	pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
				5301	AVIC_HPA_MASK);
				5302	pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
				5303	svm->vcpu.vcpu_id);
				5304	pi.is_guest_mode = true;
				5305	pi.vcpu_data = &vcpu_info;
				5306	ret = irq_set_vcpu_affinity(host_irq, &pi);
				5307
				5308	/**
				5309	* Here, we successfully setting up vcpu affinity in
				5310	* IOMMU guest mode. Now, we need to store the posted
				5311	* interrupt information in a per-vcpu ir_list so that
				5312	* we can reference to them directly when we update vcpu
				5313	* scheduling information in IOMMU irte.
				5314	*/
				5315	if (!ret && pi.is_guest_mode)
				5316	svm_ir_list_add(svm, &pi);
				5317	} else {
				5318	/* Use legacy mode in IRTE */
				5319	struct amd_iommu_pi_data pi;
				5320
				5321	/**
				5322	* Here, pi is used to:
				5323	* - Tell IOMMU to use legacy mode for this interrupt.
				5324	* - Retrieve ga_tag of prior interrupt remapping data.
				5325	*/
				5326	pi.is_guest_mode = false;
				5327	ret = irq_set_vcpu_affinity(host_irq, &pi);
				5328
				5329	/**
				5330	* Check if the posted interrupt was previously
				5331	* setup with the guest_mode by checking if the ga_tag
				5332	* was cached. If so, we need to clean up the per-vcpu
				5333	* ir_list.
				5334	*/
				5335	if (!ret && pi.prev_ga_tag) {
				5336	int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
				5337	struct kvm_vcpu *vcpu;
				5338
				5339	vcpu = kvm_get_vcpu_by_id(kvm, id);
				5340	if (vcpu)
				5341	svm_ir_list_del(to_svm(vcpu), &pi);
				5342	}
				5343	}
				5344
				5345	if (!ret && svm) {
				5346	trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
				5347	e->gsi, vcpu_info.vector,
				5348	vcpu_info.pi_desc_addr, set);
				5349	}
				5350
				5351	if (ret < 0) {
				5352	pr_err("%s: failed to update PI IRTE\n", __func__);
				5353	goto out;
				5354	}
				5355	}
				5356
				5357	ret = 0;
				5358	out:
				5359	srcu_read_unlock(&kvm->irq_srcu, idx);
				5360	return ret;
				5361	}
				5362
				5363	static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
				5364	{
				5365	struct vcpu_svm *svm = to_svm(vcpu);
				5366	struct vmcb *vmcb = svm->vmcb;
				5367	int ret;
				5368	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
				5369	!(svm->vcpu.arch.hflags & HF_NMI_MASK);
				5370	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
				5371
				5372	return ret;
				5373	}
				5374
				5375	static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
				5376	{
				5377	struct vcpu_svm *svm = to_svm(vcpu);
				5378
				5379	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
				5380	}
				5381
				5382	static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
				5383	{
				5384	struct vcpu_svm *svm = to_svm(vcpu);
				5385
				5386	if (masked) {
				5387	svm->vcpu.arch.hflags \|= HF_NMI_MASK;
				5388	set_intercept(svm, INTERCEPT_IRET);
				5389	} else {
				5390	svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
				5391	clr_intercept(svm, INTERCEPT_IRET);
				5392	}
				5393	}
				5394
				5395	static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
				5396	{
				5397	struct vcpu_svm *svm = to_svm(vcpu);
				5398	struct vmcb *vmcb = svm->vmcb;
				5399	int ret;
				5400
				5401	if (!gif_set(svm) \|\|
				5402	(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
				5403	return 0;
				5404
				5405	ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
				5406
				5407	if (is_guest_mode(vcpu))
				5408	return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
				5409
				5410	return ret;
				5411	}
				5412
				/*
				 * Request an exit as soon as the guest can take an interrupt by arming
				 * the virtual-interrupt intercept. Does nothing when APICv is active.
				 */
				static void enable_irq_window(struct kvm_vcpu *vcpu)
				{
					struct vcpu_svm *svm = to_svm(vcpu);

					if (kvm_vcpu_apicv_active(vcpu))
						return;

					/*
					 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
					 * 1, because that's a separate STGI/VMRUN intercept. The next time we
					 * get that intercept, this function will be called again though and
					 * we'll get the vintr intercept. However, if the vGIF feature is
					 * enabled, the STGI interception will not occur. Enable the irq
					 * window under the assumption that the hardware will set the GIF.
					 */
					if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
						svm_set_vintr(svm);
						svm_inject_irq(svm, 0x0);
					}
				}
				5433
				5434	static void enable_nmi_window(struct kvm_vcpu *vcpu)
				5435	{
				5436	struct vcpu_svm *svm = to_svm(vcpu);
				5437
				5438	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK \| HF_IRET_MASK))
				5439	== HF_NMI_MASK)
				5440	return; /* IRET will cause a vm exit */
				5441
				5442	if (!gif_set(svm)) {
				5443	if (vgif_enabled(svm))
				5444	set_intercept(svm, INTERCEPT_STGI);
				5445	return; /* STGI will cause a vm exit */
				5446	}
				5447
				5448	if (svm->nested.exit_required)
				5449	return; /* we're not going to run the guest yet */
				5450
				5451	/*
				5452	* Something prevents NMI from been injected. Single step over possible
				5453	* problem (IRET or exception injection or interrupt shadow)
				5454	*/
				5455	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
				5456	svm->nmi_singlestep = true;
				5457	svm->vmcb->save.rflags \|= (X86_EFLAGS_TF \| X86_EFLAGS_RF);
				5458	}
				5459
				/* Nothing to configure here: @addr is ignored and 0 is always returned. */
				static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
				{
					int ret = 0;

					return ret;
				}
				5464
				5465	static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
				5466	{
				5467	return 0;
				5468	}
				5469
				5470	static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
				5471	{
				5472	struct vcpu_svm *svm = to_svm(vcpu);
				5473
				5474	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
				5475	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
				5476	else
				5477	svm->asid_generation--;
				5478	}
				5479
				5480	static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
				5481	{
				5482	struct vcpu_svm *svm = to_svm(vcpu);
				5483
				5484	invlpga(gva, svm->vmcb->control.asid);
				5485	}
				5486
				/* No-op in this implementation. */
				static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
				{
					/* nothing to do */
				}
				5490
				5491	static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
				5492	{
				5493	struct vcpu_svm *svm = to_svm(vcpu);
				5494
				5495	if (svm_nested_virtualize_tpr(vcpu))
				5496	return;
				5497
				5498	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
				5499	int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
				5500	kvm_set_cr8(vcpu, cr8);
				5501	}
				5502	}
				5503
				5504	static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
				5505	{
				5506	struct vcpu_svm *svm = to_svm(vcpu);
				5507	u64 cr8;
				5508
				5509	if (svm_nested_virtualize_tpr(vcpu) \|\|
				5510	kvm_vcpu_apicv_active(vcpu))
				5511	return;
				5512
				5513	cr8 = kvm_get_cr8(vcpu);
				5514	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
				5515	svm->vmcb->control.int_ctl \|= cr8 & V_TPR_MASK;
				5516	}
				5517
				5518	static void svm_complete_interrupts(struct vcpu_svm *svm)
				5519	{
				5520	u8 vector;
				5521	int type;
				5522	u32 exitintinfo = svm->vmcb->control.exit_int_info;
				5523	unsigned int3_injected = svm->int3_injected;
				5524
				5525	svm->int3_injected = 0;
				5526
				5527	/*
				5528	* If we've made progress since setting HF_IRET_MASK, we've
				5529	* executed an IRET and can allow NMI injection.
				5530	*/
				5531	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
				5532	&& kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
				5533	svm->vcpu.arch.hflags &= ~(HF_NMI_MASK \| HF_IRET_MASK);
				5534	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
				5535	}
				5536
				5537	svm->vcpu.arch.nmi_injected = false;
				5538	kvm_clear_exception_queue(&svm->vcpu);
				5539	kvm_clear_interrupt_queue(&svm->vcpu);
				5540
				5541	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
				5542	return;
				5543
				5544	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
				5545
				5546	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
				5547	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
				5548
				5549	switch (type) {
				5550	case SVM_EXITINTINFO_TYPE_NMI:
				5551	svm->vcpu.arch.nmi_injected = true;
				5552	break;
				5553	case SVM_EXITINTINFO_TYPE_EXEPT:
				5554	/*
				5555	* In case of software exceptions, do not reinject the vector,
				5556	* but re-execute the instruction instead. Rewind RIP first
				5557	* if we emulated INT3 before.
				5558	*/
				5559	if (kvm_exception_is_soft(vector)) {
				5560	if (vector == BP_VECTOR && int3_injected &&
				5561	kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
				5562	kvm_rip_write(&svm->vcpu,
				5563	kvm_rip_read(&svm->vcpu) -
				5564	int3_injected);
				5565	break;
				5566	}
				5567	if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
				5568	u32 err = svm->vmcb->control.exit_int_info_err;
				5569	kvm_requeue_exception_e(&svm->vcpu, vector, err);
				5570
				5571	} else
				5572	kvm_requeue_exception(&svm->vcpu, vector);
				5573	break;
				5574	case SVM_EXITINTINFO_TYPE_INTR:
				5575	kvm_queue_interrupt(&svm->vcpu, vector, false);
				5576	break;
				5577	default:
				5578	break;
				5579	}
				5580	}
				5581
				5582	static void svm_cancel_injection(struct kvm_vcpu *vcpu)
				5583	{
				5584	struct vcpu_svm *svm = to_svm(vcpu);
				5585	struct vmcb_control_area *control = &svm->vmcb->control;
				5586
				5587	control->exit_int_info = control->event_inj;
				5588	control->exit_int_info_err = control->event_inj_err;
				5589	control->event_inj = 0;
				5590	svm_complete_interrupts(svm);
				5591	}
				5592
				5593	static void svm_vcpu_run(struct kvm_vcpu *vcpu)
				5594	{
				5595	struct vcpu_svm *svm = to_svm(vcpu);
				5596
				5597	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
				5598	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
				5599	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
				5600
				5601	/*
				5602	* A vmexit emulation is required before the vcpu can be executed
				5603	* again.
				5604	*/
				5605	if (unlikely(svm->nested.exit_required))
				5606	return;
				5607
				5608	/*
				5609	* Disable singlestep if we're injecting an interrupt/exception.
				5610	* We don't want our modified rflags to be pushed on the stack where
				5611	* we might not be able to easily reset them if we disabled NMI
				5612	* singlestep later.
				5613	*/
				5614	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
				5615	/*
				5616	* Event injection happens before external interrupts cause a
				5617	* vmexit and interrupts are disabled here, so smp_send_reschedule
				5618	* is enough to force an immediate vmexit.
				5619	*/
				5620	disable_nmi_singlestep(svm);
				5621	smp_send_reschedule(vcpu->cpu);
				5622	}
				5623
				5624	pre_svm_run(svm);
				5625
				5626	sync_lapic_to_cr8(vcpu);
				5627
				5628	svm->vmcb->save.cr2 = vcpu->arch.cr2;
				5629
				5630	clgi();
				5631	kvm_load_guest_xcr0(vcpu);
				5632
				5633	/*
				5634	* If this vCPU has touched SPEC_CTRL, restore the guest's value if
				5635	* it's non-zero. Since vmentry is serialising on affected CPUs, there
				5636	* is no need to worry about the conditional branch over the wrmsr
				5637	* being speculatively taken.
				5638	*/
				5639	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
				5640
				5641	local_irq_enable();
				5642
				5643	asm volatile (
				5644	"push %%" _ASM_BP "; \n\t"
				5645	"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
				5646	"mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
				5647	"mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
				5648	"mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
				5649	"mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
				5650	"mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
				5651	#ifdef CONFIG_X86_64
				5652	"mov %c[r8](%[svm]), %%r8 \n\t"
				5653	"mov %c[r9](%[svm]), %%r9 \n\t"
				5654	"mov %c[r10](%[svm]), %%r10 \n\t"
				5655	"mov %c[r11](%[svm]), %%r11 \n\t"
				5656	"mov %c[r12](%[svm]), %%r12 \n\t"
				5657	"mov %c[r13](%[svm]), %%r13 \n\t"
				5658	"mov %c[r14](%[svm]), %%r14 \n\t"
				5659	"mov %c[r15](%[svm]), %%r15 \n\t"
				5660	#endif
				5661
				5662	/* Enter guest mode */
				5663	"push %%" _ASM_AX " \n\t"
				5664	"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
				5665	__ex(SVM_VMLOAD) "\n\t"
				5666	__ex(SVM_VMRUN) "\n\t"
				5667	__ex(SVM_VMSAVE) "\n\t"
				5668	"pop %%" _ASM_AX " \n\t"
				5669
				5670	/* Save guest registers, load host registers */
				5671	"mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
				5672	"mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
				5673	"mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
				5674	"mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
				5675	"mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
				5676	"mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
				5677	#ifdef CONFIG_X86_64
				5678	"mov %%r8, %c[r8](%[svm]) \n\t"
				5679	"mov %%r9, %c[r9](%[svm]) \n\t"
				5680	"mov %%r10, %c[r10](%[svm]) \n\t"
				5681	"mov %%r11, %c[r11](%[svm]) \n\t"
				5682	"mov %%r12, %c[r12](%[svm]) \n\t"
				5683	"mov %%r13, %c[r13](%[svm]) \n\t"
				5684	"mov %%r14, %c[r14](%[svm]) \n\t"
				5685	"mov %%r15, %c[r15](%[svm]) \n\t"
				5686	#endif
				5687	/*
				5688	* Clear host registers marked as clobbered to prevent
				5689	* speculative use.
				5690	*/
				5691	"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
				5692	"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
				5693	"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
				5694	"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
				5695	"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
				5696	#ifdef CONFIG_X86_64
				5697	"xor %%r8, %%r8 \n\t"
				5698	"xor %%r9, %%r9 \n\t"
				5699	"xor %%r10, %%r10 \n\t"
				5700	"xor %%r11, %%r11 \n\t"
				5701	"xor %%r12, %%r12 \n\t"
				5702	"xor %%r13, %%r13 \n\t"
				5703	"xor %%r14, %%r14 \n\t"
				5704	"xor %%r15, %%r15 \n\t"
				5705	#endif
				5706	"pop %%" _ASM_BP
				5707	:
				5708	: [svm]"a"(svm),
				5709	[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
				5710	[rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
				5711	[rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
				5712	[rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
				5713	[rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
				5714	[rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
				5715	[rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
				5716	#ifdef CONFIG_X86_64
				5717	, [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
				5718	[r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
				5719	[r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
				5720	[r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
				5721	[r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
				5722	[r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
				5723	[r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
				5724	[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
				5725	#endif
				5726	: "cc", "memory"
				5727	#ifdef CONFIG_X86_64
				5728	, "rbx", "rcx", "rdx", "rsi", "rdi"
				5729	, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
				5730	#else
				5731	, "ebx", "ecx", "edx", "esi", "edi"
				5732	#endif
				5733	);
				5734
				5735	/* Eliminate branch target predictions from guest mode */
				5736	vmexit_fill_RSB();
				5737
				5738	#ifdef CONFIG_X86_64
				5739	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
				5740	#else
				5741	loadsegment(fs, svm->host.fs);
				5742	#ifndef CONFIG_X86_32_LAZY_GS
				5743	loadsegment(gs, svm->host.gs);
				5744	#endif
				5745	#endif
				5746
				5747	/*
				5748	* We do not use IBRS in the kernel. If this vCPU has used the
				5749	* SPEC_CTRL MSR it may have left it on; save the value and
				5750	* turn it off. This is much more efficient than blindly adding
				5751	* it to the atomic save/restore list. Especially as the former
				5752	* (Saving guest MSRs on vmexit) doesn't even exist in KVM.
				5753	*
				5754	* For non-nested case:
				5755	* If the L01 MSR bitmap does not intercept the MSR, then we need to
				5756	* save it.
				5757	*
				5758	* For nested case:
				5759	* If the L02 MSR bitmap does not intercept the MSR, then we need to
				5760	* save it.
				5761	*/
				5762	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
				5763	svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
				5764
				5765	reload_tss(vcpu);
				5766
				5767	local_irq_disable();
				5768
				5769	x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
				5770
				5771	vcpu->arch.cr2 = svm->vmcb->save.cr2;
				5772	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
				5773	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
				5774	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
				5775
				5776	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
				5777	kvm_before_interrupt(&svm->vcpu);
				5778
				5779	kvm_put_guest_xcr0(vcpu);
				5780	stgi();
				5781
				5782	/* Any pending NMI will happen here */
				5783
				5784	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
				5785	kvm_after_interrupt(&svm->vcpu);
				5786
				5787	sync_cr8_to_lapic(vcpu);
				5788
				5789	svm->next_rip = 0;
				5790
				5791	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
				5792
				5793	/* if exit due to PF check for async PF */
				5794	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
				5795	svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
				5796
				5797	if (npt_enabled) {
				5798	vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
				5799	vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
				5800	}
				5801
				5802	/*
				5803	* We need to handle MC intercepts here before the vcpu has a chance to
				5804	* change the physical cpu
				5805	*/
				5806	if (unlikely(svm->vmcb->control.exit_code ==
				5807	SVM_EXIT_EXCP_BASE + MC_VECTOR))
				5808	svm_handle_mce(svm);
				5809
				5810	mark_all_clean(svm->vmcb);
				5811	}
				5812	STACK_FRAME_NON_STANDARD(svm_vcpu_run);
				5813
				5814	static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
				5815	{
				5816	struct vcpu_svm *svm = to_svm(vcpu);
				5817
				5818	svm->vmcb->save.cr3 = __sme_set(root);
				5819	mark_dirty(svm->vmcb, VMCB_CR);
				5820	}
				5821
				5822	static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
				5823	{
				5824	struct vcpu_svm *svm = to_svm(vcpu);
				5825
				5826	svm->vmcb->control.nested_cr3 = __sme_set(root);
				5827	mark_dirty(svm->vmcb, VMCB_NPT);
				5828
				5829	/* Also sync guest cr3 here in case we live migrate */
				5830	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
				5831	mark_dirty(svm->vmcb, VMCB_CR);
				5832	}
				5833
				5834	static int is_disabled(void)
				5835	{
				5836	u64 vm_cr;
				5837
				5838	rdmsrl(MSR_VM_CR, vm_cr);
				5839	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
				5840	return 1;
				5841
				5842	return 0;
				5843	}
				5844
				5845	static void
				5846	svm_patch_hypercall(struct kvm_vcpu vcpu, unsigned char hypercall)
				5847	{
				5848	/*
				5849	* Patch in the VMMCALL instruction:
				5850	*/
				5851	hypercall[0] = 0x0f;
				5852	hypercall[1] = 0x01;
				5853	hypercall[2] = 0xd9;
				5854	}
				5855
				5856	static void svm_check_processor_compat(void *rtn)
				5857	{
				5858	(int )rtn = 0;
				5859	}
				5860
				5861	static bool svm_cpu_has_accelerated_tpr(void)
				5862	{
				5863	return false;
				5864	}
				5865
				5866	static bool svm_has_emulated_msr(int index)
				5867	{
				5868	switch (index) {
				5869	case MSR_IA32_MCG_EXT_CTL:
				5870	return false;
				5871	default:
				5872	break;
				5873	}
				5874
				5875	return true;
				5876	}
				5877
				5878	static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
				5879	{
				5880	return 0;
				5881	}
				5882
				5883	static void svm_cpuid_update(struct kvm_vcpu *vcpu)
				5884	{
				5885	struct vcpu_svm *svm = to_svm(vcpu);
				5886
				5887	/* Update nrips enabled cache */
				5888	svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
				5889
				5890	if (!kvm_vcpu_apicv_active(vcpu))
				5891	return;
				5892
				5893	guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
				5894	}
				5895
				5896	static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
				5897	{
				5898	switch (func) {
				5899	case 0x1:
				5900	if (avic)
				5901	entry->ecx &= ~bit(X86_FEATURE_X2APIC);
				5902	break;
				5903	case 0x80000001:
				5904	if (nested)
				5905	entry->ecx \|= (1 << 2); /* Set SVM bit */
				5906	break;
				5907	case 0x8000000A:
				5908	entry->eax = 1; /* SVM revision 1 */
				5909	entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
				5910	ASID emulation to nested SVM */
				5911	entry->ecx = 0; /* Reserved */
				5912	entry->edx = 0; /* Per default do not support any
				5913	additional features */
				5914
				5915	/* Support next_rip if host supports it */
				5916	if (boot_cpu_has(X86_FEATURE_NRIPS))
				5917	entry->edx \|= SVM_FEATURE_NRIP;
				5918
				5919	/* Support NPT for the guest if enabled */
				5920	if (npt_enabled)
				5921	entry->edx \|= SVM_FEATURE_NPT;
				5922
				5923	break;
				5924	case 0x8000001F:
				5925	/* Support memory encryption cpuid if host supports it */
				5926	if (boot_cpu_has(X86_FEATURE_SEV))
				5927	cpuid(0x8000001f, &entry->eax, &entry->ebx,
				5928	&entry->ecx, &entry->edx);
				5929
				5930	}
				5931	}
				5932
				5933	static int svm_get_lpage_level(void)
				5934	{
				5935	return PT_PDPE_LEVEL;
				5936	}
				5937
				5938	static bool svm_rdtscp_supported(void)
				5939	{
				5940	return boot_cpu_has(X86_FEATURE_RDTSCP);
				5941	}
				5942
				5943	static bool svm_invpcid_supported(void)
				5944	{
				5945	return false;
				5946	}
				5947
				5948	static bool svm_mpx_supported(void)
				5949	{
				5950	return false;
				5951	}
				5952
				5953	static bool svm_xsaves_supported(void)
				5954	{
				5955	return false;
				5956	}
				5957
				5958	static bool svm_umip_emulated(void)
				5959	{
				5960	return false;
				5961	}
				5962
				5963	static bool svm_has_wbinvd_exit(void)
				5964	{
				5965	return true;
				5966	}
				5967
				5968	#define PRE_EX(exit) { .exit_code = (exit), \
				5969	.stage = X86_ICPT_PRE_EXCEPT, }
				5970	#define POST_EX(exit) { .exit_code = (exit), \
				5971	.stage = X86_ICPT_POST_EXCEPT, }
				5972	#define POST_MEM(exit) { .exit_code = (exit), \
				5973	.stage = X86_ICPT_POST_MEMACCESS, }
				5974
				5975	static const struct __x86_intercept {
				5976	u32 exit_code;
				5977	enum x86_intercept_stage stage;
				5978	} x86_intercept_map[] = {
				5979	[x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
				5980	[x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
				5981	[x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
				5982	[x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
				5983	[x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
				5984	[x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
				5985	[x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
				5986	[x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
				5987	[x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
				5988	[x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
				5989	[x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
				5990	[x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
				5991	[x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
				5992	[x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
				5993	[x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
				5994	[x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
				5995	[x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
				5996	[x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
				5997	[x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
				5998	[x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
				5999	[x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
				6000	[x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
				6001	[x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
				6002	[x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
				6003	[x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
				6004	[x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
				6005	[x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
				6006	[x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
				6007	[x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
				6008	[x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
				6009	[x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
				6010	[x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
				6011	[x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
				6012	[x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
				6013	[x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
				6014	[x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
				6015	[x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
				6016	[x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
				6017	[x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
				6018	[x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
				6019	[x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
				6020	[x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
				6021	[x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
				6022	[x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
				6023	[x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
				6024	[x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
				6025	};
				6026
				6027	#undef PRE_EX
				6028	#undef POST_EX
				6029	#undef POST_MEM
				6030
				6031	static int svm_check_intercept(struct kvm_vcpu *vcpu,
				6032	struct x86_instruction_info *info,
				6033	enum x86_intercept_stage stage)
				6034	{
				6035	struct vcpu_svm *svm = to_svm(vcpu);
				6036	int vmexit, ret = X86EMUL_CONTINUE;
				6037	struct __x86_intercept icpt_info;
				6038	struct vmcb *vmcb = svm->vmcb;
				6039
				6040	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
				6041	goto out;
				6042
				6043	icpt_info = x86_intercept_map[info->intercept];
				6044
				6045	if (stage != icpt_info.stage)
				6046	goto out;
				6047
				6048	switch (icpt_info.exit_code) {
				6049	case SVM_EXIT_READ_CR0:
				6050	if (info->intercept == x86_intercept_cr_read)
				6051	icpt_info.exit_code += info->modrm_reg;
				6052	break;
				6053	case SVM_EXIT_WRITE_CR0: {
				6054	unsigned long cr0, val;
				6055	u64 intercept;
				6056
				6057	if (info->intercept == x86_intercept_cr_write)
				6058	icpt_info.exit_code += info->modrm_reg;
				6059
				6060	if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 \|\|
				6061	info->intercept == x86_intercept_clts)
				6062	break;
				6063
				6064	intercept = svm->nested.intercept;
				6065
				6066	if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
				6067	break;
				6068
				6069	cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
				6070	val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
				6071
				6072	if (info->intercept == x86_intercept_lmsw) {
				6073	cr0 &= 0xfUL;
				6074	val &= 0xfUL;
				6075	/* lmsw can't clear PE - catch this here */
				6076	if (cr0 & X86_CR0_PE)
				6077	val \|= X86_CR0_PE;
				6078	}
				6079
				6080	if (cr0 ^ val)
				6081	icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
				6082
				6083	break;
				6084	}
				6085	case SVM_EXIT_READ_DR0:
				6086	case SVM_EXIT_WRITE_DR0:
				6087	icpt_info.exit_code += info->modrm_reg;
				6088	break;
				6089	case SVM_EXIT_MSR:
				6090	if (info->intercept == x86_intercept_wrmsr)
				6091	vmcb->control.exit_info_1 = 1;
				6092	else
				6093	vmcb->control.exit_info_1 = 0;
				6094	break;
				6095	case SVM_EXIT_PAUSE:
				6096	/*
				6097	* We get this for NOP only, but pause
				6098	* is rep not, check this here
				6099	*/
				6100	if (info->rep_prefix != REPE_PREFIX)
				6101	goto out;
				6102	break;
				6103	case SVM_EXIT_IOIO: {
				6104	u64 exit_info;
				6105	u32 bytes;
				6106
				6107	if (info->intercept == x86_intercept_in \|\|
				6108	info->intercept == x86_intercept_ins) {
				6109	exit_info = ((info->src_val & 0xffff) << 16) \|
				6110	SVM_IOIO_TYPE_MASK;
				6111	bytes = info->dst_bytes;
				6112	} else {
				6113	exit_info = (info->dst_val & 0xffff) << 16;
				6114	bytes = info->src_bytes;
				6115	}
				6116
				6117	if (info->intercept == x86_intercept_outs \|\|
				6118	info->intercept == x86_intercept_ins)
				6119	exit_info \|= SVM_IOIO_STR_MASK;
				6120
				6121	if (info->rep_prefix)
				6122	exit_info \|= SVM_IOIO_REP_MASK;
				6123
				6124	bytes = min(bytes, 4u);
				6125
				6126	exit_info \|= bytes << SVM_IOIO_SIZE_SHIFT;
				6127
				6128	exit_info \|= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
				6129
				6130	vmcb->control.exit_info_1 = exit_info;
				6131	vmcb->control.exit_info_2 = info->next_rip;
				6132
				6133	break;
				6134	}
				6135	default:
				6136	break;
				6137	}
				6138
				6139	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
				6140	if (static_cpu_has(X86_FEATURE_NRIPS))
				6141	vmcb->control.next_rip = info->next_rip;
				6142	vmcb->control.exit_code = icpt_info.exit_code;
				6143	vmexit = nested_svm_exit_handled(svm);
				6144
				6145	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
				6146	: X86EMUL_CONTINUE;
				6147
				6148	out:
				6149	return ret;
				6150	}
				6151
				6152	static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
				6153	{
				6154	local_irq_enable();
				6155	/*
				6156	* We must have an instruction with interrupts enabled, so
				6157	* the timer interrupt isn't delayed by the interrupt shadow.
				6158	*/
				6159	asm("nop");
				6160	local_irq_disable();
				6161	}
				6162
				6163	static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
				6164	{
				6165	if (pause_filter_thresh)
				6166	shrink_ple_window(vcpu);
				6167	}
				6168
				6169	static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
				6170	{
				6171	if (avic_handle_apic_id_update(vcpu) != 0)
				6172	return;
				6173	if (avic_handle_dfr_update(vcpu) != 0)
				6174	return;
				6175	avic_handle_ldr_update(vcpu);
				6176	}
				6177
				6178	static void svm_setup_mce(struct kvm_vcpu *vcpu)
				6179	{
				6180	/* [63:9] are reserved. */
				6181	vcpu->arch.mcg_cap &= 0x1ff;
				6182	}
				6183
				6184	static int svm_smi_allowed(struct kvm_vcpu *vcpu)
				6185	{
				6186	struct vcpu_svm *svm = to_svm(vcpu);
				6187
				6188	/* Per APM Vol.2 15.22.2 "Response to SMI" */
				6189	if (!gif_set(svm))
				6190	return 0;
				6191
				6192	if (is_guest_mode(&svm->vcpu) &&
				6193	svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
				6194	/* TODO: Might need to set exit_info_1 and exit_info_2 here */
				6195	svm->vmcb->control.exit_code = SVM_EXIT_SMI;
				6196	svm->nested.exit_required = true;
				6197	return 0;
				6198	}
				6199
				6200	return 1;
				6201	}
				6202
				6203	static int svm_pre_enter_smm(struct kvm_vcpu vcpu, char smstate)
				6204	{
				6205	struct vcpu_svm *svm = to_svm(vcpu);
				6206	int ret;
				6207
				6208	if (is_guest_mode(vcpu)) {
				6209	/* FED8h - SVM Guest */
				6210	put_smstate(u64, smstate, 0x7ed8, 1);
				6211	/* FEE0h - SVM Guest VMCB Physical Address */
				6212	put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
				6213
				6214	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
				6215	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
				6216	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
				6217
				6218	ret = nested_svm_vmexit(svm);
				6219	if (ret)
				6220	return ret;
				6221	}
				6222	return 0;
				6223	}
				6224
				6225	static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
				6226	{
				6227	struct vcpu_svm *svm = to_svm(vcpu);
				6228	struct vmcb *nested_vmcb;
				6229	struct page *page;
				6230	struct {
				6231	u64 guest;
				6232	u64 vmcb;
				6233	} svm_state_save;
				6234	int ret;
				6235
				6236	ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
				6237	sizeof(svm_state_save));
				6238	if (ret)
				6239	return ret;
				6240
				6241	if (svm_state_save.guest) {
				6242	vcpu->arch.hflags &= ~HF_SMM_MASK;
				6243	nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
				6244	if (nested_vmcb)
				6245	enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
				6246	else
				6247	ret = 1;
				6248	vcpu->arch.hflags \|= HF_SMM_MASK;
				6249	}
				6250	return ret;
				6251	}
				6252
				6253	static int enable_smi_window(struct kvm_vcpu *vcpu)
				6254	{
				6255	struct vcpu_svm *svm = to_svm(vcpu);
				6256
				6257	if (!gif_set(svm)) {
				6258	if (vgif_enabled(svm))
				6259	set_intercept(svm, INTERCEPT_STGI);
				6260	/* STGI will cause a vm exit */
				6261	return 1;
				6262	}
				6263	return 0;
				6264	}
				6265
				6266	static int sev_asid_new(void)
				6267	{
				6268	int pos;
				6269
				6270	/*
				6271	* SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
				6272	*/
				6273	pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
				6274	if (pos >= max_sev_asid)
				6275	return -EBUSY;
				6276
				6277	set_bit(pos, sev_asid_bitmap);
				6278	return pos + 1;
				6279	}
				6280
				6281	static int sev_guest_init(struct kvm kvm, struct kvm_sev_cmd argp)
				6282	{
				6283	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6284	int asid, ret;
				6285
				6286	ret = -EBUSY;
				6287	if (unlikely(sev->active))
				6288	return ret;
				6289
				6290	asid = sev_asid_new();
				6291	if (asid < 0)
				6292	return ret;
				6293
				6294	ret = sev_platform_init(&argp->error);
				6295	if (ret)
				6296	goto e_free;
				6297
				6298	sev->active = true;
				6299	sev->asid = asid;
				6300	INIT_LIST_HEAD(&sev->regions_list);
				6301
				6302	return 0;
				6303
				6304	e_free:
				6305	__sev_asid_free(asid);
				6306	return ret;
				6307	}
				6308
				6309	static int sev_bind_asid(struct kvm kvm, unsigned int handle, int error)
				6310	{
				6311	struct sev_data_activate *data;
				6312	int asid = sev_get_asid(kvm);
				6313	int ret;
				6314
				6315	wbinvd_on_all_cpus();
				6316
				6317	ret = sev_guest_df_flush(error);
				6318	if (ret)
				6319	return ret;
				6320
				6321	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6322	if (!data)
				6323	return -ENOMEM;
				6324
				6325	/* activate ASID on the given handle */
				6326	data->handle = handle;
				6327	data->asid = asid;
				6328	ret = sev_guest_activate(data, error);
				6329	kfree(data);
				6330
				6331	return ret;
				6332	}
				6333
				6334	static int __sev_issue_cmd(int fd, int id, void data, int error)
				6335	{
				6336	struct fd f;
				6337	int ret;
				6338
				6339	f = fdget(fd);
				6340	if (!f.file)
				6341	return -EBADF;
				6342
				6343	ret = sev_issue_cmd_external_user(f.file, id, data, error);
				6344
				6345	fdput(f);
				6346	return ret;
				6347	}
				6348
				6349	static int sev_issue_cmd(struct kvm kvm, int id, void data, int *error)
				6350	{
				6351	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6352
				6353	return __sev_issue_cmd(sev->fd, id, data, error);
				6354	}
				6355
				6356	static int sev_launch_start(struct kvm kvm, struct kvm_sev_cmd argp)
				6357	{
				6358	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6359	struct sev_data_launch_start *start;
				6360	struct kvm_sev_launch_start params;
				6361	void dh_blob, session_blob;
				6362	int *error = &argp->error;
				6363	int ret;
				6364
				6365	if (!sev_guest(kvm))
				6366	return -ENOTTY;
				6367
				6368	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
				6369	return -EFAULT;
				6370
				6371	start = kzalloc(sizeof(*start), GFP_KERNEL);
				6372	if (!start)
				6373	return -ENOMEM;
				6374
				6375	dh_blob = NULL;
				6376	if (params.dh_uaddr) {
				6377	dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
				6378	if (IS_ERR(dh_blob)) {
				6379	ret = PTR_ERR(dh_blob);
				6380	goto e_free;
				6381	}
				6382
				6383	start->dh_cert_address = __sme_set(__pa(dh_blob));
				6384	start->dh_cert_len = params.dh_len;
				6385	}
				6386
				6387	session_blob = NULL;
				6388	if (params.session_uaddr) {
				6389	session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
				6390	if (IS_ERR(session_blob)) {
				6391	ret = PTR_ERR(session_blob);
				6392	goto e_free_dh;
				6393	}
				6394
				6395	start->session_address = __sme_set(__pa(session_blob));
				6396	start->session_len = params.session_len;
				6397	}
				6398
				6399	start->handle = params.handle;
				6400	start->policy = params.policy;
				6401
				6402	/* create memory encryption context */
				6403	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
				6404	if (ret)
				6405	goto e_free_session;
				6406
				6407	/* Bind ASID to this guest */
				6408	ret = sev_bind_asid(kvm, start->handle, error);
				6409	if (ret)
				6410	goto e_free_session;
				6411
				6412	/* return handle to userspace */
				6413	params.handle = start->handle;
				6414	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
				6415	sev_unbind_asid(kvm, start->handle);
				6416	ret = -EFAULT;
				6417	goto e_free_session;
				6418	}
				6419
				6420	sev->handle = start->handle;
				6421	sev->fd = argp->sev_fd;
				6422
				6423	e_free_session:
				6424	kfree(session_blob);
				6425	e_free_dh:
				6426	kfree(dh_blob);
				6427	e_free:
				6428	kfree(start);
				6429	return ret;
				6430	}
				6431
				6432	static unsigned long get_num_contig_pages(unsigned long idx,
				6433	struct page **inpages, unsigned long npages)
				6434	{
				6435	unsigned long paddr, next_paddr;
				6436	unsigned long i = idx + 1, pages = 1;
				6437
				6438	/* find the number of contiguous pages starting from idx */
				6439	paddr = __sme_page_pa(inpages[idx]);
				6440	while (i < npages) {
				6441	next_paddr = __sme_page_pa(inpages[i++]);
				6442	if ((paddr + PAGE_SIZE) == next_paddr) {
				6443	pages++;
				6444	paddr = next_paddr;
				6445	continue;
				6446	}
				6447	break;
				6448	}
				6449
				6450	return pages;
				6451	}
				6452
				6453	static int sev_launch_update_data(struct kvm kvm, struct kvm_sev_cmd argp)
				6454	{
				6455	unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
				6456	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6457	struct kvm_sev_launch_update_data params;
				6458	struct sev_data_launch_update_data *data;
				6459	struct page **inpages;
				6460	int ret;
				6461
				6462	if (!sev_guest(kvm))
				6463	return -ENOTTY;
				6464
				6465	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
				6466	return -EFAULT;
				6467
				6468	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6469	if (!data)
				6470	return -ENOMEM;
				6471
				6472	vaddr = params.uaddr;
				6473	size = params.len;
				6474	vaddr_end = vaddr + size;
				6475
				6476	/* Lock the user memory. */
				6477	inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
				6478	if (!inpages) {
				6479	ret = -ENOMEM;
				6480	goto e_free;
				6481	}
				6482
				6483	/*
				6484	* The LAUNCH_UPDATE command will perform in-place encryption of the
				6485	* memory content (i.e it will write the same memory region with C=1).
				6486	* It's possible that the cache may contain the data with C=0, i.e.,
				6487	* unencrypted so invalidate it first.
				6488	*/
				6489	sev_clflush_pages(inpages, npages);
				6490
				6491	for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
				6492	int offset, len;
				6493
				6494	/*
				6495	* If the user buffer is not page-aligned, calculate the offset
				6496	* within the page.
				6497	*/
				6498	offset = vaddr & (PAGE_SIZE - 1);
				6499
				6500	/* Calculate the number of pages that can be encrypted in one go. */
				6501	pages = get_num_contig_pages(i, inpages, npages);
				6502
				6503	len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
				6504
				6505	data->handle = sev->handle;
				6506	data->len = len;
				6507	data->address = __sme_page_pa(inpages[i]) + offset;
				6508	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
				6509	if (ret)
				6510	goto e_unpin;
				6511
				6512	size -= len;
				6513	next_vaddr = vaddr + len;
				6514	}
				6515
				6516	e_unpin:
				6517	/* content of memory is updated, mark pages dirty */
				6518	for (i = 0; i < npages; i++) {
				6519	set_page_dirty_lock(inpages[i]);
				6520	mark_page_accessed(inpages[i]);
				6521	}
				6522	/* unlock the user pages */
				6523	sev_unpin_memory(kvm, inpages, npages);
				6524	e_free:
				6525	kfree(data);
				6526	return ret;
				6527	}
				6528
				6529	static int sev_launch_measure(struct kvm kvm, struct kvm_sev_cmd argp)
				6530	{
				6531	void __user measure = (void __user )(uintptr_t)argp->data;
				6532	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6533	struct sev_data_launch_measure *data;
				6534	struct kvm_sev_launch_measure params;
				6535	void __user *p = NULL;
				6536	void *blob = NULL;
				6537	int ret;
				6538
				6539	if (!sev_guest(kvm))
				6540	return -ENOTTY;
				6541
				6542	if (copy_from_user(&params, measure, sizeof(params)))
				6543	return -EFAULT;
				6544
				6545	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6546	if (!data)
				6547	return -ENOMEM;
				6548
				6549	/* User wants to query the blob length */
				6550	if (!params.len)
				6551	goto cmd;
				6552
				6553	p = (void __user *)(uintptr_t)params.uaddr;
				6554	if (p) {
				6555	if (params.len > SEV_FW_BLOB_MAX_SIZE) {
				6556	ret = -EINVAL;
				6557	goto e_free;
				6558	}
				6559
				6560	ret = -ENOMEM;
				6561	blob = kmalloc(params.len, GFP_KERNEL);
				6562	if (!blob)
				6563	goto e_free;
				6564
				6565	data->address = __psp_pa(blob);
				6566	data->len = params.len;
				6567	}
				6568
				6569	cmd:
				6570	data->handle = sev->handle;
				6571	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
				6572
				6573	/*
				6574	* If we query the session length, FW responded with expected data.
				6575	*/
				6576	if (!params.len)
				6577	goto done;
				6578
				6579	if (ret)
				6580	goto e_free_blob;
				6581
				6582	if (blob) {
				6583	if (copy_to_user(p, blob, params.len))
				6584	ret = -EFAULT;
				6585	}
				6586
				6587	done:
				6588	params.len = data->len;
				6589	if (copy_to_user(measure, &params, sizeof(params)))
				6590	ret = -EFAULT;
				6591	e_free_blob:
				6592	kfree(blob);
				6593	e_free:
				6594	kfree(data);
				6595	return ret;
				6596	}
				6597
				6598	static int sev_launch_finish(struct kvm kvm, struct kvm_sev_cmd argp)
				6599	{
				6600	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6601	struct sev_data_launch_finish *data;
				6602	int ret;
				6603
				6604	if (!sev_guest(kvm))
				6605	return -ENOTTY;
				6606
				6607	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6608	if (!data)
				6609	return -ENOMEM;
				6610
				6611	data->handle = sev->handle;
				6612	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
				6613
				6614	kfree(data);
				6615	return ret;
				6616	}
				6617
				6618	static int sev_guest_status(struct kvm kvm, struct kvm_sev_cmd argp)
				6619	{
				6620	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6621	struct kvm_sev_guest_status params;
				6622	struct sev_data_guest_status *data;
				6623	int ret;
				6624
				6625	if (!sev_guest(kvm))
				6626	return -ENOTTY;
				6627
				6628	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6629	if (!data)
				6630	return -ENOMEM;
				6631
				6632	data->handle = sev->handle;
				6633	ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
				6634	if (ret)
				6635	goto e_free;
				6636
				6637	params.policy = data->policy;
				6638	params.state = data->state;
				6639	params.handle = data->handle;
				6640
				6641	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
				6642	ret = -EFAULT;
				6643	e_free:
				6644	kfree(data);
				6645	return ret;
				6646	}
				6647
				6648	static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
				6649	unsigned long dst, int size,
				6650	int *error, bool enc)
				6651	{
				6652	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6653	struct sev_data_dbg *data;
				6654	int ret;
				6655
				6656	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6657	if (!data)
				6658	return -ENOMEM;
				6659
				6660	data->handle = sev->handle;
				6661	data->dst_addr = dst;
				6662	data->src_addr = src;
				6663	data->len = size;
				6664
				6665	ret = sev_issue_cmd(kvm,
				6666	enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
				6667	data, error);
				6668	kfree(data);
				6669	return ret;
				6670	}
				6671
				6672	static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
				6673	unsigned long dst_paddr, int sz, int *err)
				6674	{
				6675	int offset;
				6676
				6677	/*
				6678	* Its safe to read more than we are asked, caller should ensure that
				6679	* destination has enough space.
				6680	*/
				6681	src_paddr = round_down(src_paddr, 16);
				6682	offset = src_paddr & 15;
				6683	sz = round_up(sz + offset, 16);
				6684
				6685	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
				6686	}
				6687
				6688	static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
				6689	unsigned long __user dst_uaddr,
				6690	unsigned long dst_paddr,
				6691	int size, int *err)
				6692	{
				6693	struct page *tpage = NULL;
				6694	int ret, offset;
				6695
				6696	/* if inputs are not 16-byte then use intermediate buffer */
				6697	if (!IS_ALIGNED(dst_paddr, 16) \|\|
				6698	!IS_ALIGNED(paddr, 16) \|\|
				6699	!IS_ALIGNED(size, 16)) {
				6700	tpage = (void *)alloc_page(GFP_KERNEL);
				6701	if (!tpage)
				6702	return -ENOMEM;
				6703
				6704	dst_paddr = __sme_page_pa(tpage);
				6705	}
				6706
				6707	ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
				6708	if (ret)
				6709	goto e_free;
				6710
				6711	if (tpage) {
				6712	offset = paddr & 15;
				6713	if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
				6714	page_address(tpage) + offset, size))
				6715	ret = -EFAULT;
				6716	}
				6717
				6718	e_free:
				6719	if (tpage)
				6720	__free_page(tpage);
				6721
				6722	return ret;
				6723	}
				6724
				6725	static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
				6726	unsigned long __user vaddr,
				6727	unsigned long dst_paddr,
				6728	unsigned long __user dst_vaddr,
				6729	int size, int *error)
				6730	{
				6731	struct page *src_tpage = NULL;
				6732	struct page *dst_tpage = NULL;
				6733	int ret, len = size;
				6734
				6735	/* If source buffer is not aligned then use an intermediate buffer */
				6736	if (!IS_ALIGNED(vaddr, 16)) {
				6737	src_tpage = alloc_page(GFP_KERNEL);
				6738	if (!src_tpage)
				6739	return -ENOMEM;
				6740
				6741	if (copy_from_user(page_address(src_tpage),
				6742	(void __user *)(uintptr_t)vaddr, size)) {
				6743	__free_page(src_tpage);
				6744	return -EFAULT;
				6745	}
				6746
				6747	paddr = __sme_page_pa(src_tpage);
				6748	}
				6749
				6750	/*
				6751	* If destination buffer or length is not aligned then do read-modify-write:
				6752	* - decrypt destination in an intermediate buffer
				6753	* - copy the source buffer in an intermediate buffer
				6754	* - use the intermediate buffer as source buffer
				6755	*/
				6756	if (!IS_ALIGNED(dst_vaddr, 16) \|\| !IS_ALIGNED(size, 16)) {
				6757	int dst_offset;
				6758
				6759	dst_tpage = alloc_page(GFP_KERNEL);
				6760	if (!dst_tpage) {
				6761	ret = -ENOMEM;
				6762	goto e_free;
				6763	}
				6764
				6765	ret = __sev_dbg_decrypt(kvm, dst_paddr,
				6766	__sme_page_pa(dst_tpage), size, error);
				6767	if (ret)
				6768	goto e_free;
				6769
				6770	/*
				6771	* If source is kernel buffer then use memcpy() otherwise
				6772	* copy_from_user().
				6773	*/
				6774	dst_offset = dst_paddr & 15;
				6775
				6776	if (src_tpage)
				6777	memcpy(page_address(dst_tpage) + dst_offset,
				6778	page_address(src_tpage), size);
				6779	else {
				6780	if (copy_from_user(page_address(dst_tpage) + dst_offset,
				6781	(void __user *)(uintptr_t)vaddr, size)) {
				6782	ret = -EFAULT;
				6783	goto e_free;
				6784	}
				6785	}
				6786
				6787	paddr = __sme_page_pa(dst_tpage);
				6788	dst_paddr = round_down(dst_paddr, 16);
				6789	len = round_up(size, 16);
				6790	}
				6791
				6792	ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
				6793
				6794	e_free:
				6795	if (src_tpage)
				6796	__free_page(src_tpage);
				6797	if (dst_tpage)
				6798	__free_page(dst_tpage);
				6799	return ret;
				6800	}
				6801
				6802	static int sev_dbg_crypt(struct kvm kvm, struct kvm_sev_cmd argp, bool dec)
				6803	{
				6804	unsigned long vaddr, vaddr_end, next_vaddr;
				6805	unsigned long dst_vaddr;
				6806	struct page src_p, dst_p;
				6807	struct kvm_sev_dbg debug;
				6808	unsigned long n;
				6809	unsigned int size;
				6810	int ret;
				6811
				6812	if (!sev_guest(kvm))
				6813	return -ENOTTY;
				6814
				6815	if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
				6816	return -EFAULT;
				6817
				6818	if (!debug.len \|\| debug.src_uaddr + debug.len < debug.src_uaddr)
				6819	return -EINVAL;
				6820	if (!debug.dst_uaddr)
				6821	return -EINVAL;
				6822
				6823	vaddr = debug.src_uaddr;
				6824	size = debug.len;
				6825	vaddr_end = vaddr + size;
				6826	dst_vaddr = debug.dst_uaddr;
				6827
				6828	for (; vaddr < vaddr_end; vaddr = next_vaddr) {
				6829	int len, s_off, d_off;
				6830
				6831	/* lock userspace source and destination page */
				6832	src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
				6833	if (!src_p)
				6834	return -EFAULT;
				6835
				6836	dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
				6837	if (!dst_p) {
				6838	sev_unpin_memory(kvm, src_p, n);
				6839	return -EFAULT;
				6840	}
				6841
				6842	/*
				6843	* The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
				6844	* memory content (i.e it will write the same memory region with C=1).
				6845	* It's possible that the cache may contain the data with C=0, i.e.,
				6846	* unencrypted so invalidate it first.
				6847	*/
				6848	sev_clflush_pages(src_p, 1);
				6849	sev_clflush_pages(dst_p, 1);
				6850
				6851	/*
				6852	* Since user buffer may not be page aligned, calculate the
				6853	* offset within the page.
				6854	*/
				6855	s_off = vaddr & ~PAGE_MASK;
				6856	d_off = dst_vaddr & ~PAGE_MASK;
				6857	len = min_t(size_t, (PAGE_SIZE - s_off), size);
				6858
				6859	if (dec)
				6860	ret = __sev_dbg_decrypt_user(kvm,
				6861	__sme_page_pa(src_p[0]) + s_off,
				6862	dst_vaddr,
				6863	__sme_page_pa(dst_p[0]) + d_off,
				6864	len, &argp->error);
				6865	else
				6866	ret = __sev_dbg_encrypt_user(kvm,
				6867	__sme_page_pa(src_p[0]) + s_off,
				6868	vaddr,
				6869	__sme_page_pa(dst_p[0]) + d_off,
				6870	dst_vaddr,
				6871	len, &argp->error);
				6872
				6873	sev_unpin_memory(kvm, src_p, n);
				6874	sev_unpin_memory(kvm, dst_p, n);
				6875
				6876	if (ret)
				6877	goto err;
				6878
				6879	next_vaddr = vaddr + len;
				6880	dst_vaddr = dst_vaddr + len;
				6881	size -= len;
				6882	}
				6883	err:
				6884	return ret;
				6885	}
				6886
				6887	static int sev_launch_secret(struct kvm kvm, struct kvm_sev_cmd argp)
				6888	{
				6889	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				6890	struct sev_data_launch_secret *data;
				6891	struct kvm_sev_launch_secret params;
				6892	struct page **pages;
				6893	void blob, hdr;
				6894	unsigned long n;
				6895	int ret, offset;
				6896
				6897	if (!sev_guest(kvm))
				6898	return -ENOTTY;
				6899
				6900	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
				6901	return -EFAULT;
				6902
				6903	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
				6904	if (!pages)
				6905	return -ENOMEM;
				6906
				6907	/*
				6908	* The secret must be copied into contiguous memory region, lets verify
				6909	* that userspace memory pages are contiguous before we issue command.
				6910	*/
				6911	if (get_num_contig_pages(0, pages, n) != n) {
				6912	ret = -EINVAL;
				6913	goto e_unpin_memory;
				6914	}
				6915
				6916	ret = -ENOMEM;
				6917	data = kzalloc(sizeof(*data), GFP_KERNEL);
				6918	if (!data)
				6919	goto e_unpin_memory;
				6920
				6921	offset = params.guest_uaddr & (PAGE_SIZE - 1);
				6922	data->guest_address = __sme_page_pa(pages[0]) + offset;
				6923	data->guest_len = params.guest_len;
				6924
				6925	blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
				6926	if (IS_ERR(blob)) {
				6927	ret = PTR_ERR(blob);
				6928	goto e_free;
				6929	}
				6930
				6931	data->trans_address = __psp_pa(blob);
				6932	data->trans_len = params.trans_len;
				6933
				6934	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
				6935	if (IS_ERR(hdr)) {
				6936	ret = PTR_ERR(hdr);
				6937	goto e_free_blob;
				6938	}
				6939	data->hdr_address = __psp_pa(hdr);
				6940	data->hdr_len = params.hdr_len;
				6941
				6942	data->handle = sev->handle;
				6943	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
				6944
				6945	kfree(hdr);
				6946
				6947	e_free_blob:
				6948	kfree(blob);
				6949	e_free:
				6950	kfree(data);
				6951	e_unpin_memory:
				6952	sev_unpin_memory(kvm, pages, n);
				6953	return ret;
				6954	}
				6955
				6956	static int svm_mem_enc_op(struct kvm kvm, void __user argp)
				6957	{
				6958	struct kvm_sev_cmd sev_cmd;
				6959	int r;
				6960
				6961	if (!svm_sev_enabled())
				6962	return -ENOTTY;
				6963
				6964	if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
				6965	return -EFAULT;
				6966
				6967	mutex_lock(&kvm->lock);
				6968
				6969	switch (sev_cmd.id) {
				6970	case KVM_SEV_INIT:
				6971	r = sev_guest_init(kvm, &sev_cmd);
				6972	break;
				6973	case KVM_SEV_LAUNCH_START:
				6974	r = sev_launch_start(kvm, &sev_cmd);
				6975	break;
				6976	case KVM_SEV_LAUNCH_UPDATE_DATA:
				6977	r = sev_launch_update_data(kvm, &sev_cmd);
				6978	break;
				6979	case KVM_SEV_LAUNCH_MEASURE:
				6980	r = sev_launch_measure(kvm, &sev_cmd);
				6981	break;
				6982	case KVM_SEV_LAUNCH_FINISH:
				6983	r = sev_launch_finish(kvm, &sev_cmd);
				6984	break;
				6985	case KVM_SEV_GUEST_STATUS:
				6986	r = sev_guest_status(kvm, &sev_cmd);
				6987	break;
				6988	case KVM_SEV_DBG_DECRYPT:
				6989	r = sev_dbg_crypt(kvm, &sev_cmd, true);
				6990	break;
				6991	case KVM_SEV_DBG_ENCRYPT:
				6992	r = sev_dbg_crypt(kvm, &sev_cmd, false);
				6993	break;
				6994	case KVM_SEV_LAUNCH_SECRET:
				6995	r = sev_launch_secret(kvm, &sev_cmd);
				6996	break;
				6997	default:
				6998	r = -EINVAL;
				6999	goto out;
				7000	}
				7001
				7002	if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
				7003	r = -EFAULT;
				7004
				7005	out:
				7006	mutex_unlock(&kvm->lock);
				7007	return r;
				7008	}
				7009
				7010	static int svm_register_enc_region(struct kvm *kvm,
				7011	struct kvm_enc_region *range)
				7012	{
				7013	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				7014	struct enc_region *region;
				7015	int ret = 0;
				7016
				7017	if (!sev_guest(kvm))
				7018	return -ENOTTY;
				7019
				7020	if (range->addr > ULONG_MAX \|\| range->size > ULONG_MAX)
				7021	return -EINVAL;
				7022
				7023	region = kzalloc(sizeof(*region), GFP_KERNEL);
				7024	if (!region)
				7025	return -ENOMEM;
				7026
				7027	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
				7028	if (!region->pages) {
				7029	ret = -ENOMEM;
				7030	goto e_free;
				7031	}
				7032
				7033	/*
				7034	* The guest may change the memory encryption attribute from C=0 -> C=1
				7035	* or vice versa for this memory range. Lets make sure caches are
				7036	* flushed to ensure that guest data gets written into memory with
				7037	* correct C-bit.
				7038	*/
				7039	sev_clflush_pages(region->pages, region->npages);
				7040
				7041	region->uaddr = range->addr;
				7042	region->size = range->size;
				7043
				7044	mutex_lock(&kvm->lock);
				7045	list_add_tail(&region->list, &sev->regions_list);
				7046	mutex_unlock(&kvm->lock);
				7047
				7048	return ret;
				7049
				7050	e_free:
				7051	kfree(region);
				7052	return ret;
				7053	}
				7054
				7055	static struct enc_region *
				7056	find_enc_region(struct kvm kvm, struct kvm_enc_region range)
				7057	{
				7058	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
				7059	struct list_head *head = &sev->regions_list;
				7060	struct enc_region *i;
				7061
				7062	list_for_each_entry(i, head, list) {
				7063	if (i->uaddr == range->addr &&
				7064	i->size == range->size)
				7065	return i;
				7066	}
				7067
				7068	return NULL;
				7069	}
				7070
				7071
				7072	static int svm_unregister_enc_region(struct kvm *kvm,
				7073	struct kvm_enc_region *range)
				7074	{
				7075	struct enc_region *region;
				7076	int ret;
				7077
				7078	mutex_lock(&kvm->lock);
				7079
				7080	if (!sev_guest(kvm)) {
				7081	ret = -ENOTTY;
				7082	goto failed;
				7083	}
				7084
				7085	region = find_enc_region(kvm, range);
				7086	if (!region) {
				7087	ret = -EINVAL;
				7088	goto failed;
				7089	}
				7090
				7091	__unregister_enc_region_locked(kvm, region);
				7092
				7093	mutex_unlock(&kvm->lock);
				7094	return 0;
				7095
				7096	failed:
				7097	mutex_unlock(&kvm->lock);
				7098	return ret;
				7099	}
				7100
				7101	static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
				7102	.cpu_has_kvm_support = has_svm,
				7103	.disabled_by_bios = is_disabled,
				7104	.hardware_setup = svm_hardware_setup,
				7105	.hardware_unsetup = svm_hardware_unsetup,
				7106	.check_processor_compatibility = svm_check_processor_compat,
				7107	.hardware_enable = svm_hardware_enable,
				7108	.hardware_disable = svm_hardware_disable,
				7109	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
				7110	.has_emulated_msr = svm_has_emulated_msr,
				7111
				7112	.vcpu_create = svm_create_vcpu,
				7113	.vcpu_free = svm_free_vcpu,
				7114	.vcpu_reset = svm_vcpu_reset,
				7115
				7116	.vm_alloc = svm_vm_alloc,
				7117	.vm_free = svm_vm_free,
				7118	.vm_init = avic_vm_init,
				7119	.vm_destroy = svm_vm_destroy,
				7120
				7121	.prepare_guest_switch = svm_prepare_guest_switch,
				7122	.vcpu_load = svm_vcpu_load,
				7123	.vcpu_put = svm_vcpu_put,
				7124	.vcpu_blocking = svm_vcpu_blocking,
				7125	.vcpu_unblocking = svm_vcpu_unblocking,
				7126
				7127	.update_bp_intercept = update_bp_intercept,
				7128	.get_msr_feature = svm_get_msr_feature,
				7129	.get_msr = svm_get_msr,
				7130	.set_msr = svm_set_msr,
				7131	.get_segment_base = svm_get_segment_base,
				7132	.get_segment = svm_get_segment,
				7133	.set_segment = svm_set_segment,
				7134	.get_cpl = svm_get_cpl,
				7135	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
				7136	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
				7137	.decache_cr3 = svm_decache_cr3,
				7138	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
				7139	.set_cr0 = svm_set_cr0,
				7140	.set_cr3 = svm_set_cr3,
				7141	.set_cr4 = svm_set_cr4,
				7142	.set_efer = svm_set_efer,
				7143	.get_idt = svm_get_idt,
				7144	.set_idt = svm_set_idt,
				7145	.get_gdt = svm_get_gdt,
				7146	.set_gdt = svm_set_gdt,
				7147	.get_dr6 = svm_get_dr6,
				7148	.set_dr6 = svm_set_dr6,
				7149	.set_dr7 = svm_set_dr7,
				7150	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
				7151	.cache_reg = svm_cache_reg,
				7152	.get_rflags = svm_get_rflags,
				7153	.set_rflags = svm_set_rflags,
				7154
				7155	.tlb_flush = svm_flush_tlb,
				7156	.tlb_flush_gva = svm_flush_tlb_gva,
				7157
				7158	.run = svm_vcpu_run,
				7159	.handle_exit = handle_exit,
				7160	.skip_emulated_instruction = skip_emulated_instruction,
				7161	.set_interrupt_shadow = svm_set_interrupt_shadow,
				7162	.get_interrupt_shadow = svm_get_interrupt_shadow,
				7163	.patch_hypercall = svm_patch_hypercall,
				7164	.set_irq = svm_set_irq,
				7165	.set_nmi = svm_inject_nmi,
				7166	.queue_exception = svm_queue_exception,
				7167	.cancel_injection = svm_cancel_injection,
				7168	.interrupt_allowed = svm_interrupt_allowed,
				7169	.nmi_allowed = svm_nmi_allowed,
				7170	.get_nmi_mask = svm_get_nmi_mask,
				7171	.set_nmi_mask = svm_set_nmi_mask,
				7172	.enable_nmi_window = enable_nmi_window,
				7173	.enable_irq_window = enable_irq_window,
				7174	.update_cr8_intercept = update_cr8_intercept,
				7175	.set_virtual_apic_mode = svm_set_virtual_apic_mode,
				7176	.get_enable_apicv = svm_get_enable_apicv,
				7177	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
				7178	.load_eoi_exitmap = svm_load_eoi_exitmap,
				7179	.hwapic_irr_update = svm_hwapic_irr_update,
				7180	.hwapic_isr_update = svm_hwapic_isr_update,
				7181	.sync_pir_to_irr = kvm_lapic_find_highest_irr,
				7182	.apicv_post_state_restore = avic_post_state_restore,
				7183
				7184	.set_tss_addr = svm_set_tss_addr,
				7185	.set_identity_map_addr = svm_set_identity_map_addr,
				7186	.get_tdp_level = get_npt_level,
				7187	.get_mt_mask = svm_get_mt_mask,
				7188
				7189	.get_exit_info = svm_get_exit_info,
				7190
				7191	.get_lpage_level = svm_get_lpage_level,
				7192
				7193	.cpuid_update = svm_cpuid_update,
				7194
				7195	.rdtscp_supported = svm_rdtscp_supported,
				7196	.invpcid_supported = svm_invpcid_supported,
				7197	.mpx_supported = svm_mpx_supported,
				7198	.xsaves_supported = svm_xsaves_supported,
				7199	.umip_emulated = svm_umip_emulated,
				7200
				7201	.set_supported_cpuid = svm_set_supported_cpuid,
				7202
				7203	.has_wbinvd_exit = svm_has_wbinvd_exit,
				7204
				7205	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
				7206	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
				7207
				7208	.set_tdp_cr3 = set_tdp_cr3,
				7209
				7210	.check_intercept = svm_check_intercept,
				7211	.handle_external_intr = svm_handle_external_intr,
				7212
				7213	.request_immediate_exit = __kvm_request_immediate_exit,
				7214
				7215	.sched_in = svm_sched_in,
				7216
				7217	.pmu_ops = &amd_pmu_ops,
				7218	.deliver_posted_interrupt = svm_deliver_avic_intr,
				7219	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
				7220	.update_pi_irte = svm_update_pi_irte,
				7221	.setup_mce = svm_setup_mce,
				7222
				7223	.smi_allowed = svm_smi_allowed,
				7224	.pre_enter_smm = svm_pre_enter_smm,
				7225	.pre_leave_smm = svm_pre_leave_smm,
				7226	.enable_smi_window = enable_smi_window,
				7227
				7228	.mem_enc_op = svm_mem_enc_op,
				7229	.mem_enc_reg_region = svm_register_enc_region,
				7230	.mem_enc_unreg_region = svm_unregister_enc_region,
				7231	};
				7232
				7233	static int __init svm_init(void)
				7234	{
				7235	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
				7236	__alignof__(struct vcpu_svm), THIS_MODULE);
				7237	}
				7238
				7239	static void __exit svm_exit(void)
				7240	{
				7241	kvm_exit();
				7242	}
				7243
				7244	module_init(svm_init)
				7245	module_exit(svm_exit)