Blame - src/kernel/linux/v4.14/arch/x86/kvm/mmu.c - T103

blob: 1cceee0ed580ded5f1aece42bd8445ef5a6ef2b3 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* This module enables machines with Intel VT-x extensions to run virtual
				5	* machines without emulation or binary translation.
				6	*
				7	* MMU support
				8	*
				9	* Copyright (C) 2006 Qumranet, Inc.
				10	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				11	*
				12	* Authors:
				13	* Yaniv Kamay <yaniv@qumranet.com>
				14	* Avi Kivity <avi@qumranet.com>
				15	*
				16	* This work is licensed under the terms of the GNU GPL, version 2. See
				17	* the COPYING file in the top-level directory.
				18	*
				19	*/
				20
				21	#include "irq.h"
				22	#include "mmu.h"
				23	#include "x86.h"
				24	#include "kvm_cache_regs.h"
				25	#include "cpuid.h"
				26
				27	#include <linux/kvm_host.h>
				28	#include <linux/types.h>
				29	#include <linux/string.h>
				30	#include <linux/mm.h>
				31	#include <linux/highmem.h>
				32	#include <linux/moduleparam.h>
				33	#include <linux/export.h>
				34	#include <linux/swap.h>
				35	#include <linux/hugetlb.h>
				36	#include <linux/compiler.h>
				37	#include <linux/srcu.h>
				38	#include <linux/slab.h>
				39	#include <linux/sched/signal.h>
				40	#include <linux/uaccess.h>
				41	#include <linux/hash.h>
				42	#include <linux/kern_levels.h>
				43	#include <linux/kthread.h>
				44
				45	#include <asm/page.h>
				46	#include <asm/cmpxchg.h>
				47	#include <asm/io.h>
				48	#include <asm/vmx.h>
				49	#include <asm/kvm_page_track.h>
				50	#include "trace.h"
				51
				52	extern bool itlb_multihit_kvm_mitigation;
				53
				54	static int __read_mostly nx_huge_pages = -1;
				55	static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
				56
				57	static int set_nx_huge_pages(const char val, const struct kernel_param kp);
				58	static int set_nx_huge_pages_recovery_ratio(const char val, const struct kernel_param kp);
				59
				60	static struct kernel_param_ops nx_huge_pages_ops = {
				61	.set = set_nx_huge_pages,
				62	.get = param_get_bool,
				63	};
				64
				65	static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
				66	.set = set_nx_huge_pages_recovery_ratio,
				67	.get = param_get_uint,
				68	};
				69
				70	module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
				71	__MODULE_PARM_TYPE(nx_huge_pages, "bool");
				72	module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
				73	&nx_huge_pages_recovery_ratio, 0644);
				74	__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
				75
/*
 * When this variable is set to true it enables Two-Dimensional-Paging,
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 *
 * Initialised to false.
 */
bool tdp_enabled = false;
				84
/*
 * Identifiers for MMU audit points, in pre/post pairs for page faults,
 * PTE writes and syncs.
 * NOTE(review): the code that consumes these values is not visible in this
 * part of the file.
 */
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};
				93
/*
 * MMU_DEBUG is force-undefined here, so the #else branch below is the one
 * compiled: pgprintk(), rmap_printk() and MMU_WARN_ON() all expand to empty
 * statements, and the argument of MMU_WARN_ON() is not evaluated.
 */
#undef MMU_DEBUG

#ifdef MMU_DEBUG
/* Runtime switch (module parameter, mode 0644) gating the debug printks. */
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif
				108
				109	#define PTE_PREFETCH_NUM 8
				110
				111	#define PT_FIRST_AVAIL_BITS_SHIFT 10
				112	#define PT64_SECOND_AVAIL_BITS_SHIFT 52
				113
				114	#define PT64_LEVEL_BITS 9
				115
				116	#define PT64_LEVEL_SHIFT(level) \
				117	(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
				118
				119	#define PT64_INDEX(address, level)\
				120	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
				121
				122
				123	#define PT32_LEVEL_BITS 10
				124
				125	#define PT32_LEVEL_SHIFT(level) \
				126	(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
				127
				128	#define PT32_LVL_OFFSET_MASK(level) \
				129	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
				130	* PT32_LEVEL_BITS))) - 1))
				131
				132	#define PT32_INDEX(address, level)\
				133	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
				134
				135
/*
 * Address masks for 64-bit paging. The base mask covers bits PAGE_SHIFT..51
 * and is passed through __sme_clr().
 * NOTE(review): __sme_clr() presumably strips the memory-encryption bit -
 * confirm against its definition.
 */
#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
/* Address bits at or above the index field of @level. */
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
	* PT64_LEVEL_BITS))) - 1))
/* Page-aligned address bits below the index field of @level. */
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
	* PT64_LEVEL_BITS))) - 1))

/* Address masks for 32-bit paging, built on PAGE_MASK. */
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
	* PT32_LEVEL_BITS))) - 1))
				152
				153	#define PT64_PERM_MASK (PT_PRESENT_MASK \| PT_WRITABLE_MASK \| shadow_user_mask \
				154	\| shadow_x_mask \| shadow_nx_mask \| shadow_me_mask)
				155
				156	#define ACC_EXEC_MASK 1
				157	#define ACC_WRITE_MASK PT_WRITABLE_MASK
				158	#define ACC_USER_MASK PT_USER_MASK
				159	#define ACC_ALL (ACC_EXEC_MASK \| ACC_WRITE_MASK \| ACC_USER_MASK)
				160
/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK 0x1ull
#define PT64_EPT_EXECUTABLE_MASK 0x4ull

#include <trace/events/kvm.h>

/*
 * Two software bits, placed at PT_FIRST_AVAIL_BITS_SHIFT and the bit above
 * it. spte_can_locklessly_be_made_writable() requires both to be set.
 */
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

/* Shadow page tables are indexed with the 64-bit paging layout. */
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
				174
/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let the CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 *
 * For handle_mmio_page_fault only:
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 */
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};
				188
/*
 * One node of a chained list of spte pointers: room for PTE_LIST_EXT
 * pointers plus a link to a further descriptor.
 */
struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};
				193
/*
 * Cursor used by the for_each_shadow_entry*() macros; it is initialised by
 * shadow_walk_init() and advanced by shadow_walk_next()/__shadow_walk_next().
 * NOTE(review): the field meanings below are inferred from names and types,
 * since the walk helpers are defined outside this part of the file.
 */
struct kvm_shadow_walk_iterator {
	u64 addr;		/* presumably the address being translated */
	hpa_t shadow_addr;	/* presumably the current shadow table address */
	u64 *sptep;		/* current spte; read by the lockless walk macro */
	int level;
	unsigned index;
};
				201
/*
 * Iterate over the shadow page table entries for @_addr, driven by
 * shadow_walk_init(), shadow_walk_okay() and shadow_walk_next().
 */
#define for_each_shadow_entry(_vcpu, _addr, _walker) \
	for (shadow_walk_init(&(_walker), _vcpu, _addr); \
	shadow_walk_okay(&(_walker)); \
	shadow_walk_next(&(_walker)))

/*
 * Lockless variant: on each iteration the current spte is loaded into @spte
 * with mmu_spte_get_lockless(), and that snapshot is what
 * __shadow_walk_next() uses to advance.
 */
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
	for (shadow_walk_init(&(_walker), _vcpu, _addr); \
	shadow_walk_okay(&(_walker)) && \
	({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
	__shadow_walk_next(&(_walker), spte))
				212
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

/*
 * Bit masks describing the SPTE format in use. All except the MMIO pair are
 * set by kvm_mmu_set_mask_ptes(); the MMIO mask/value pair is set by
 * kvm_mmu_set_mmio_spte_mask().
 */
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
 * Non-present SPTEs with shadow_acc_track_value set are in place for access
 * tracking.
 */
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
				234
				235	/*
				236	* The mask/shift to use for saving the original R/X bits when marking the PTE
				237	* as not-present for access tracking purposes. We do not save the W bit as the
				238	* PTEs being access tracked also need to be dirty tracked, so the W bit will be
				239	* restored only when a write is attempted to the page.
				240	*/
				241	static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK \|
				242	PT64_EPT_EXECUTABLE_MASK;
				243	static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
				244
/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks. Computed in kvm_mmu_reset_all_pte_masks().
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/*
 * The number of high-order 1 bits to use in the mask above.
 */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts. This mask covers the lower bits of the GFN.
 * Computed in kvm_mmu_reset_all_pte_masks().
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

/*
 * The number of non-reserved physical address bits irrespective of features
 * that repurpose legal bits, e.g. MKTME. Set from kvm_get_shadow_phys_bits().
 */
static u8 __read_mostly shadow_phys_bits;
				271
/* Forward declarations for helpers referenced before their definitions. */
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
static bool is_executable_pte(u64 spte);

/* CREATE_TRACE_POINTS is defined before mmutrace.h is included. */
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
				278
				279
				280	void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
				281	{
				282	BUG_ON((mmio_mask & mmio_value) != mmio_value);
				283	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
				284	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
				285	shadow_mmio_value = mmio_value \| SPTE_SPECIAL_MASK;
				286	shadow_mmio_mask = mmio_mask \| SPTE_SPECIAL_MASK;
				287	}
				288	EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
				289
				290	static bool is_mmio_spte(u64 spte)
				291	{
				292	return (spte & shadow_mmio_mask) == shadow_mmio_value;
				293	}
				294
				295	static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
				296	{
				297	return sp->role.ad_disabled;
				298	}
				299
				300	static inline bool spte_ad_enabled(u64 spte)
				301	{
				302	MMU_WARN_ON(is_mmio_spte(spte));
				303	return !(spte & shadow_acc_track_value);
				304	}
				305
				306	static bool is_nx_huge_page_enabled(void)
				307	{
				308	return READ_ONCE(nx_huge_pages);
				309	}
				310
				311	static inline u64 spte_shadow_accessed_mask(u64 spte)
				312	{
				313	MMU_WARN_ON(is_mmio_spte(spte));
				314	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
				315	}
				316
				317	static inline u64 spte_shadow_dirty_mask(u64 spte)
				318	{
				319	MMU_WARN_ON(is_mmio_spte(spte));
				320	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
				321	}
				322
				323	static inline bool is_access_track_spte(u64 spte)
				324	{
				325	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
				326	}
				327
/*
 * The low bit of the generation number is always presumed to be zero.
 * This disables mmio caching during memslot updates. The concept is
 * similar to a seqcount but instead of retrying the access we just punt
 * and ignore the cache.
 *
 * spte bits 3-11 are used as bits 1-9 of the generation number,
 * the bits 52-61 are used as bits 10-19 of the generation number.
 */
#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52

#define MMIO_GEN_SHIFT 20
#define MMIO_GEN_LOW_SHIFT 10
/* "- 2" rather than "- 1": selects generation bits 1-9 and drops bit 0. */
#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
				344
				345	static u64 generation_mmio_spte_mask(unsigned int gen)
				346	{
				347	u64 mask;
				348
				349	WARN_ON(gen & ~MMIO_GEN_MASK);
				350
				351	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
				352	mask \|= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
				353	return mask;
				354	}
				355
				356	static unsigned int get_mmio_spte_generation(u64 spte)
				357	{
				358	unsigned int gen;
				359
				360	spte &= ~shadow_mmio_mask;
				361
				362	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
				363	gen \|= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
				364	return gen;
				365	}
				366
				367	static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
				368	{
				369	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
				370	}
				371
				372	static void mark_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, u64 gfn,
				373	unsigned access)
				374	{
				375	unsigned int gen = kvm_current_mmio_generation(vcpu);
				376	u64 mask = generation_mmio_spte_mask(gen);
				377	u64 gpa = gfn << PAGE_SHIFT;
				378
				379	access &= ACC_WRITE_MASK \| ACC_USER_MASK;
				380	mask \|= shadow_mmio_value \| access;
				381	mask \|= gpa \| shadow_nonpresent_or_rsvd_mask;
				382	mask \|= (gpa & shadow_nonpresent_or_rsvd_mask)
				383	<< shadow_nonpresent_or_rsvd_mask_len;
				384
				385	trace_mark_mmio_spte(sptep, gfn, access, gen);
				386	mmu_spte_set(sptep, mask);
				387	}
				388
				389	static gfn_t get_mmio_spte_gfn(u64 spte)
				390	{
				391	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
				392
				393	gpa \|= (spte >> shadow_nonpresent_or_rsvd_mask_len)
				394	& shadow_nonpresent_or_rsvd_mask;
				395
				396	return gpa >> PAGE_SHIFT;
				397	}
				398
				399	static unsigned get_mmio_spte_access(u64 spte)
				400	{
				401	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) \| shadow_mmio_mask;
				402	return (spte & ~mask) & ~PAGE_MASK;
				403	}
				404
				405	static bool set_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				406	kvm_pfn_t pfn, unsigned access)
				407	{
				408	if (unlikely(is_noslot_pfn(pfn))) {
				409	mark_mmio_spte(vcpu, sptep, gfn, access);
				410	return true;
				411	}
				412
				413	return false;
				414	}
				415
				416	static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
				417	{
				418	unsigned int kvm_gen, spte_gen;
				419
				420	kvm_gen = kvm_current_mmio_generation(vcpu);
				421	spte_gen = get_mmio_spte_generation(spte);
				422
				423	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
				424	return likely(kvm_gen == spte_gen);
				425	}
				426
				427	/*
				428	* Sets the shadow PTE masks used by the MMU.
				429	*
				430	* Assumptions:
				431	* - Setting either @accessed_mask or @dirty_mask requires setting both
				432	* - At least one of @accessed_mask or @acc_track_mask must be set
				433	*/
				434	void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
				435	u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
				436	u64 acc_track_mask, u64 me_mask)
				437	{
				438	BUG_ON(!dirty_mask != !accessed_mask);
				439	BUG_ON(!accessed_mask && !acc_track_mask);
				440	BUG_ON(acc_track_mask & shadow_acc_track_value);
				441
				442	shadow_user_mask = user_mask;
				443	shadow_accessed_mask = accessed_mask;
				444	shadow_dirty_mask = dirty_mask;
				445	shadow_nx_mask = nx_mask;
				446	shadow_x_mask = x_mask;
				447	shadow_present_mask = p_mask;
				448	shadow_acc_track_mask = acc_track_mask;
				449	shadow_me_mask = me_mask;
				450	}
				451	EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
				452
				453	static u8 kvm_get_shadow_phys_bits(void)
				454	{
				455	/*
				456	* boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
				457	* in CPU detection code, but MKTME treats those reduced bits as
				458	* 'keyID' thus they are not reserved bits. Therefore for MKTME
				459	* we should still return physical address bits reported by CPUID.
				460	*/
				461	if (!boot_cpu_has(X86_FEATURE_TME) \|\|
				462	WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
				463	return boot_cpu_data.x86_phys_bits;
				464
				465	return cpuid_eax(0x80000008) & 0xff;
				466	}
				467
				468	static void kvm_mmu_reset_all_pte_masks(void)
				469	{
				470	u8 low_phys_bits;
				471
				472	shadow_user_mask = 0;
				473	shadow_accessed_mask = 0;
				474	shadow_dirty_mask = 0;
				475	shadow_nx_mask = 0;
				476	shadow_x_mask = 0;
				477	shadow_mmio_mask = 0;
				478	shadow_present_mask = 0;
				479	shadow_acc_track_mask = 0;
				480
				481	shadow_phys_bits = kvm_get_shadow_phys_bits();
				482
				483	/*
				484	* If the CPU has 46 or less physical address bits, then set an
				485	* appropriate mask to guard against L1TF attacks. Otherwise, it is
				486	* assumed that the CPU is not vulnerable to L1TF.
				487	*
				488	* Some Intel CPUs address the L1 cache using more PA bits than are
				489	* reported by CPUID. Use the PA width of the L1 cache when possible
				490	* to achieve more effective mitigation, e.g. if system RAM overlaps
				491	* the most significant bits of legal physical address space.
				492	*/
				493	shadow_nonpresent_or_rsvd_mask = 0;
				494	low_phys_bits = boot_cpu_data.x86_phys_bits;
				495	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
				496	!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
				497	52 - shadow_nonpresent_or_rsvd_mask_len)) {
				498	low_phys_bits = boot_cpu_data.x86_cache_bits
				499	- shadow_nonpresent_or_rsvd_mask_len;
				500	shadow_nonpresent_or_rsvd_mask =
				501	rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
				502	}
				503
				504	shadow_nonpresent_or_rsvd_lower_gfn_mask =
				505	GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
				506	}
				507
				508	static int is_cpuid_PSE36(void)
				509	{
				510	return 1;
				511	}
				512
				513	static int is_nx(struct kvm_vcpu *vcpu)
				514	{
				515	return vcpu->arch.efer & EFER_NX;
				516	}
				517
				518	static int is_shadow_present_pte(u64 pte)
				519	{
				520	return (pte != 0) && !is_mmio_spte(pte);
				521	}
				522
				523	static int is_large_pte(u64 pte)
				524	{
				525	return pte & PT_PAGE_SIZE_MASK;
				526	}
				527
				528	static int is_last_spte(u64 pte, int level)
				529	{
				530	if (level == PT_PAGE_TABLE_LEVEL)
				531	return 1;
				532	if (is_large_pte(pte))
				533	return 1;
				534	return 0;
				535	}
				536
				537	static bool is_executable_pte(u64 spte)
				538	{
				539	return (spte & (shadow_x_mask \| shadow_nx_mask)) == shadow_x_mask;
				540	}
				541
				542	static kvm_pfn_t spte_to_pfn(u64 pte)
				543	{
				544	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
				545	}
				546
				547	static gfn_t pse36_gfn_delta(u32 gpte)
				548	{
				549	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
				550
				551	return (gpte & PT32_DIR_PSE36_MASK) << shift;
				552	}
				553
				554	#ifdef CONFIG_X86_64
				555	static void __set_spte(u64 *sptep, u64 spte)
				556	{
				557	WRITE_ONCE(*sptep, spte);
				558	}
				559
				560	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
				561	{
				562	WRITE_ONCE(*sptep, spte);
				563	}
				564
				565	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
				566	{
				567	return xchg(sptep, spte);
				568	}
				569
				570	static u64 __get_spte_lockless(u64 *sptep)
				571	{
				572	return ACCESS_ONCE(*sptep);
				573	}
				574	#else
/*
 * View of a 64-bit spte as two 32-bit halves, used by the non-64-bit build
 * where the two halves are written separately.
 */
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};
				582
/*
 * Bump the clear_spte_count of the shadow page containing @sptep, unless
 * the @spte just written is shadow-present. __get_spte_lockless() compares
 * this counter before and after its read to detect a concurrent clear.
 */
static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}
				594
/*
 * Split-write variant of __set_spte(): store @spte to *@sptep as two 32-bit
 * halves, high half first, then a write barrier, then the low half.
 */
static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first, then set the present bit, so the cpu cannot
	 * fetch this spte while we are setting the spte.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}
				613
/*
 * Split-write variant of __update_clear_spte_fast(): store the low half
 * first, then a write barrier, then the high half (the reverse order of
 * __set_spte()), and finally account the update via count_spte_clear().
 */
static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first to avoid the vcpu fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}
				632
/*
 * Split-write variant of __update_clear_spte_slow(): exchange the low half
 * with xchg(), then read and replace the high half, and account the update
 * via count_spte_clear().
 *
 * Returns the previous spte, assembled from the two old halves.
 */
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}
				648
				649	/*
				650	* The idea using the light way get the spte on x86_32 guest is from
				651	* gup_get_pte(arch/x86/mm/gup.c).
				652	*
				653	* An spte tlb flush may be pending, because kvm_set_pte_rmapp
				654	* coalesces them and we are running out of the MMU lock. Therefore
				655	* we need to protect against in-progress updates of the spte.
				656	*
				657	* Reading the spte while an update is in progress may get the old value
				658	* for the high part of the spte. The race is fine for a present->non-present
				659	* change (because the high part of the spte is ignored for non-present spte),
				660	* but for a present->present change we must reread the spte.
				661	*
				662	* All such changes are done in two steps (present->non-present and
				663	* non-present->present), hence it is enough to count the number of
				664	* present->non-present updates: if it changed while reading the spte,
				665	* we might have hit the race. This is done using clear_spte_count.
				666	*/
				667	static u64 __get_spte_lockless(u64 *sptep)
				668	{
				669	struct kvm_mmu_page *sp = page_header(__pa(sptep));
				670	union split_spte spte, orig = (union split_spte )sptep;
				671	int count;
				672
				673	retry:
				674	count = sp->clear_spte_count;
				675	smp_rmb();
				676
				677	spte.spte_low = orig->spte_low;
				678	smp_rmb();
				679
				680	spte.spte_high = orig->spte_high;
				681	smp_rmb();
				682
				683	if (unlikely(spte.spte_low != orig->spte_low \|\|
				684	count != sp->clear_spte_count))
				685	goto retry;
				686
				687	return spte.spte;
				688	}
				689	#endif
				690
				691	static bool spte_can_locklessly_be_made_writable(u64 spte)
				692	{
				693	return (spte & (SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE)) ==
				694	(SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE);
				695	}
				696
				697	static bool spte_has_volatile_bits(u64 spte)
				698	{
				699	if (!is_shadow_present_pte(spte))
				700	return false;
				701
				702	/*
				703	* Always atomically update spte if it can be updated
				704	* out of mmu-lock, it can ensure dirty bit is not lost,
				705	* also, it can help us to get a stable is_writable_pte()
				706	* to ensure tlb flush is not missed.
				707	*/
				708	if (spte_can_locklessly_be_made_writable(spte) \|\|
				709	is_access_track_spte(spte))
				710	return true;
				711
				712	if (spte_ad_enabled(spte)) {
				713	if ((spte & shadow_accessed_mask) == 0 \|\|
				714	(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
				715	return true;
				716	}
				717
				718	return false;
				719	}
				720
				721	static bool is_accessed_spte(u64 spte)
				722	{
				723	u64 accessed_mask = spte_shadow_accessed_mask(spte);
				724
				725	return accessed_mask ? spte & accessed_mask
				726	: !is_access_track_spte(spte);
				727	}
				728
				729	static bool is_dirty_spte(u64 spte)
				730	{
				731	u64 dirty_mask = spte_shadow_dirty_mask(spte);
				732
				733	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
				734	}
				735
				736	/* Rules for using mmu_spte_set:
				737	* Set the sptep from nonpresent to present.
				738	* Note: the sptep being assigned must be either not present
				739	* or in a state where the hardware will not attempt to update
				740	* the spte.
				741	*/
				742	static void mmu_spte_set(u64 *sptep, u64 new_spte)
				743	{
				744	WARN_ON(is_shadow_present_pte(*sptep));
				745	__set_spte(sptep, new_spte);
				746	}
				747
				748	/*
				749	* Update the SPTE (excluding the PFN), but do not track changes in its
				750	* accessed/dirty status.
				751	*/
				752	static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
				753	{
				754	u64 old_spte = *sptep;
				755
				756	WARN_ON(!is_shadow_present_pte(new_spte));
				757
				758	if (!is_shadow_present_pte(old_spte)) {
				759	mmu_spte_set(sptep, new_spte);
				760	return old_spte;
				761	}
				762
				763	if (!spte_has_volatile_bits(old_spte))
				764	__update_clear_spte_fast(sptep, new_spte);
				765	else
				766	old_spte = __update_clear_spte_slow(sptep, new_spte);
				767
				768	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
				769
				770	return old_spte;
				771	}
				772
				773	/* Rules for using mmu_spte_update:
				774	* Update the state bits, it means the mapped pfn is not changed.
				775	*
				776	* Whenever we overwrite a writable spte with a read-only one we
				777	* should flush remote TLBs. Otherwise rmap_write_protect
				778	* will find a read-only spte, even though the writable spte
				779	* might be cached on a CPU's TLB, the return value indicates this
				780	* case.
				781	*
				782	* Returns true if the TLB needs to be flushed
				783	*/
				784	static bool mmu_spte_update(u64 *sptep, u64 new_spte)
				785	{
				786	bool flush = false;
				787	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
				788
				789	if (!is_shadow_present_pte(old_spte))
				790	return false;
				791
				792	/*
				793	* For the spte updated out of mmu-lock is safe, since
				794	* we always atomically update it, see the comments in
				795	* spte_has_volatile_bits().
				796	*/
				797	if (spte_can_locklessly_be_made_writable(old_spte) &&
				798	!is_writable_pte(new_spte))
				799	flush = true;
				800
				801	/*
				802	* Flush TLB when accessed/dirty states are changed in the page tables,
				803	* to guarantee consistency between TLB and page tables.
				804	*/
				805
				806	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
				807	flush = true;
				808	kvm_set_pfn_accessed(spte_to_pfn(old_spte));
				809	}
				810
				811	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
				812	flush = true;
				813	kvm_set_pfn_dirty(spte_to_pfn(old_spte));
				814	}
				815
				816	return flush;
				817	}
				818
				819	/*
				820	* Rules for using mmu_spte_clear_track_bits:
				821	* It sets the sptep from present to nonpresent, and track the
				822	* state bits, it is used to clear the last level sptep.
				823	* Returns non-zero if the PTE was previously valid.
				824	*/
				825	static int mmu_spte_clear_track_bits(u64 *sptep)
				826	{
				827	kvm_pfn_t pfn;
				828	u64 old_spte = *sptep;
				829
				830	if (!spte_has_volatile_bits(old_spte))
				831	__update_clear_spte_fast(sptep, 0ull);
				832	else
				833	old_spte = __update_clear_spte_slow(sptep, 0ull);
				834
				835	if (!is_shadow_present_pte(old_spte))
				836	return 0;
				837
				838	pfn = spte_to_pfn(old_spte);
				839
				840	/*
				841	* KVM does not hold the refcount of the page used by
				842	* kvm mmu, before reclaiming the page, we should
				843	* unmap it from mmu first.
				844	*/
				845	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
				846
				847	if (is_accessed_spte(old_spte))
				848	kvm_set_pfn_accessed(pfn);
				849
				850	if (is_dirty_spte(old_spte))
				851	kvm_set_pfn_dirty(pfn);
				852
				853	return 1;
				854	}
				855
				856	/*
				857	* Rules for using mmu_spte_clear_no_track:
				858	* Directly clear spte without caring the state bits of sptep,
				859	* it is used to set the upper level spte.
				860	*/
				861	static void mmu_spte_clear_no_track(u64 *sptep)
				862	{
				863	__update_clear_spte_fast(sptep, 0ull);
				864	}
				865
				866	static u64 mmu_spte_get_lockless(u64 *sptep)
				867	{
				868	return __get_spte_lockless(sptep);
				869	}
				870
				871	static u64 mark_spte_for_access_track(u64 spte)
				872	{
				873	if (spte_ad_enabled(spte))
				874	return spte & ~shadow_accessed_mask;
				875
				876	if (is_access_track_spte(spte))
				877	return spte;
				878
				879	/*
				880	* Making an Access Tracking PTE will result in removal of write access
				881	* from the PTE. So, verify that we will be able to restore the write
				882	* access in the fast page fault path later on.
				883	*/
				884	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
				885	!spte_can_locklessly_be_made_writable(spte),
				886	"kvm: Writable SPTE is not locklessly dirty-trackable\n");
				887
				888	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
				889	shadow_acc_track_saved_bits_shift),
				890	"kvm: Access Tracking saved bit locations are not zero\n");
				891
				892	spte \|= (spte & shadow_acc_track_saved_bits_mask) <<
				893	shadow_acc_track_saved_bits_shift;
				894	spte &= ~shadow_acc_track_mask;
				895
				896	return spte;
				897	}
				898
				899	/* Restore an acc-track PTE back to a regular PTE */
				900	static u64 restore_acc_track_spte(u64 spte)
				901	{
				902	u64 new_spte = spte;
				903	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
				904	& shadow_acc_track_saved_bits_mask;
				905
				906	WARN_ON_ONCE(spte_ad_enabled(spte));
				907	WARN_ON_ONCE(!is_access_track_spte(spte));
				908
				909	new_spte &= ~shadow_acc_track_mask;
				910	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
				911	shadow_acc_track_saved_bits_shift);
				912	new_spte \|= saved_bits;
				913
				914	return new_spte;
				915	}
				916
				917	/* Returns the Accessed status of the PTE and resets it at the same time. */
				918	static bool mmu_spte_age(u64 *sptep)
				919	{
				920	u64 spte = mmu_spte_get_lockless(sptep);
				921
				922	if (!is_accessed_spte(spte))
				923	return false;
				924
				925	if (spte_ad_enabled(spte)) {
				926	clear_bit((ffs(shadow_accessed_mask) - 1),
				927	(unsigned long *)sptep);
				928	} else {
				929	/*
				930	* Capture the dirty status of the page, so that it doesn't get
				931	* lost when the SPTE is marked for access tracking.
				932	*/
				933	if (is_writable_pte(spte))
				934	kvm_set_pfn_dirty(spte_to_pfn(spte));
				935
				936	spte = mark_spte_for_access_track(spte);
				937	mmu_spte_update_no_track(sptep, spte);
				938	}
				939
				940	return true;
				941	}
				942
				943	static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
				944	{
				945	/*
				946	* Prevent page table teardown by making any free-er wait during
				947	* kvm_flush_remote_tlbs() IPI to all active vcpus.
				948	*/
				949	local_irq_disable();
				950
				951	/*
				952	* Make sure a following spte read is not reordered ahead of the write
				953	* to vcpu->mode.
				954	*/
				955	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
				956	}
				957
				958	static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
				959	{
				960	/*
				961	* Make sure the write to vcpu->mode is not reordered in front of
				962	* reads to sptes. If it does, kvm_commit_zap_page() can see us
				963	* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
				964	*/
				965	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
				966	local_irq_enable();
				967	}
				968
				969	static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				970	struct kmem_cache *base_cache, int min)
				971	{
				972	void *obj;
				973
				974	if (cache->nobjs >= min)
				975	return 0;
				976	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				977	obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
				978	if (!obj)
				979	return -ENOMEM;
				980	cache->objects[cache->nobjs++] = obj;
				981	}
				982	return 0;
				983	}
				984
				985	static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
				986	{
				987	return cache->nobjs;
				988	}
				989
				990	static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				991	struct kmem_cache *cache)
				992	{
				993	while (mc->nobjs)
				994	kmem_cache_free(cache, mc->objects[--mc->nobjs]);
				995	}
				996
				997	static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				998	int min)
				999	{
				1000	void *page;
				1001
				1002	if (cache->nobjs >= min)
				1003	return 0;
				1004	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				1005	page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
				1006	if (!page)
				1007	return -ENOMEM;
				1008	cache->objects[cache->nobjs++] = page;
				1009	}
				1010	return 0;
				1011	}
				1012
				1013	static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
				1014	{
				1015	while (mc->nobjs)
				1016	free_page((unsigned long)mc->objects[--mc->nobjs]);
				1017	}
				1018
				1019	static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
				1020	{
				1021	int r;
				1022
				1023	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				1024	pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
				1025	if (r)
				1026	goto out;
				1027	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
				1028	if (r)
				1029	goto out;
				1030	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				1031	mmu_page_header_cache, 4);
				1032	out:
				1033	return r;
				1034	}
				1035
				1036	static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
				1037	{
				1038	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				1039	pte_list_desc_cache);
				1040	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
				1041	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				1042	mmu_page_header_cache);
				1043	}
				1044
				1045	static void mmu_memory_cache_alloc(struct kvm_mmu_memory_cache mc)
				1046	{
				1047	void *p;
				1048
				1049	BUG_ON(!mc->nobjs);
				1050	p = mc->objects[--mc->nobjs];
				1051	return p;
				1052	}
				1053
				1054	static struct pte_list_desc mmu_alloc_pte_list_desc(struct kvm_vcpu vcpu)
				1055	{
				1056	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
				1057	}
				1058
				1059	static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
				1060	{
				1061	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
				1062	}
				1063
				1064	static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
				1065	{
				1066	if (!sp->role.direct)
				1067	return sp->gfns[index];
				1068
				1069	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
				1070	}
				1071
				1072	static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
				1073	{
				1074	if (!sp->role.direct) {
				1075	sp->gfns[index] = gfn;
				1076	return;
				1077	}
				1078
				1079	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
				1080	pr_err_ratelimited("gfn mismatch under direct page %llx "
				1081	"(expected %llx, got %llx)\n",
				1082	sp->gfn,
				1083	kvm_mmu_page_get_gfn(sp, index), gfn);
				1084	}
				1085
				1086	/*
				1087	* Return the pointer to the large page information for a given gfn,
				1088	* handling slots that are not large page aligned.
				1089	*/
				1090	static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
				1091	struct kvm_memory_slot *slot,
				1092	int level)
				1093	{
				1094	unsigned long idx;
				1095
				1096	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1097	return &slot->arch.lpage_info[level - 2][idx];
				1098	}
				1099
				1100	static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
				1101	gfn_t gfn, int count)
				1102	{
				1103	struct kvm_lpage_info *linfo;
				1104	int i;
				1105
				1106	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1107	linfo = lpage_info_slot(gfn, slot, i);
				1108	linfo->disallow_lpage += count;
				1109	WARN_ON(linfo->disallow_lpage < 0);
				1110	}
				1111	}
				1112
				1113	void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1114	{
				1115	update_gfn_disallow_lpage_count(slot, gfn, 1);
				1116	}
				1117
				1118	void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1119	{
				1120	update_gfn_disallow_lpage_count(slot, gfn, -1);
				1121	}
				1122
				1123	static void account_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1124	{
				1125	struct kvm_memslots *slots;
				1126	struct kvm_memory_slot *slot;
				1127	gfn_t gfn;
				1128
				1129	kvm->arch.indirect_shadow_pages++;
				1130	gfn = sp->gfn;
				1131	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1132	slot = __gfn_to_memslot(slots, gfn);
				1133
				1134	/* the non-leaf shadow pages are keeping readonly. */
				1135	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1136	return kvm_slot_page_track_add_page(kvm, slot, gfn,
				1137	KVM_PAGE_TRACK_WRITE);
				1138
				1139	kvm_mmu_gfn_disallow_lpage(slot, gfn);
				1140	}
				1141
				1142	static void account_huge_nx_page(struct kvm kvm, struct kvm_mmu_page sp)
				1143	{
				1144	if (sp->lpage_disallowed)
				1145	return;
				1146
				1147	++kvm->stat.nx_lpage_splits;
				1148	list_add_tail(&sp->lpage_disallowed_link,
				1149	&kvm->arch.lpage_disallowed_mmu_pages);
				1150	sp->lpage_disallowed = true;
				1151	}
				1152
				1153	static void unaccount_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1154	{
				1155	struct kvm_memslots *slots;
				1156	struct kvm_memory_slot *slot;
				1157	gfn_t gfn;
				1158
				1159	kvm->arch.indirect_shadow_pages--;
				1160	gfn = sp->gfn;
				1161	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1162	slot = __gfn_to_memslot(slots, gfn);
				1163	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1164	return kvm_slot_page_track_remove_page(kvm, slot, gfn,
				1165	KVM_PAGE_TRACK_WRITE);
				1166
				1167	kvm_mmu_gfn_allow_lpage(slot, gfn);
				1168	}
				1169
				1170	static void unaccount_huge_nx_page(struct kvm kvm, struct kvm_mmu_page sp)
				1171	{
				1172	--kvm->stat.nx_lpage_splits;
				1173	sp->lpage_disallowed = false;
				1174	list_del(&sp->lpage_disallowed_link);
				1175	}
				1176
				1177	static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
				1178	struct kvm_memory_slot *slot)
				1179	{
				1180	struct kvm_lpage_info *linfo;
				1181
				1182	if (slot) {
				1183	linfo = lpage_info_slot(gfn, slot, level);
				1184	return !!linfo->disallow_lpage;
				1185	}
				1186
				1187	return true;
				1188	}
				1189
				1190	static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
				1191	int level)
				1192	{
				1193	struct kvm_memory_slot *slot;
				1194
				1195	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1196	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
				1197	}
				1198
				1199	static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
				1200	{
				1201	unsigned long page_size;
				1202	int i, ret = 0;
				1203
				1204	page_size = kvm_host_page_size(vcpu, gfn);
				1205
				1206	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1207	if (page_size >= KVM_HPAGE_SIZE(i))
				1208	ret = i;
				1209	else
				1210	break;
				1211	}
				1212
				1213	return ret;
				1214	}
				1215
				1216	static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
				1217	bool no_dirty_log)
				1218	{
				1219	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1220	return false;
				1221	if (no_dirty_log && slot->dirty_bitmap)
				1222	return false;
				1223
				1224	return true;
				1225	}
				1226
				1227	static struct kvm_memory_slot *
				1228	gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
				1229	bool no_dirty_log)
				1230	{
				1231	struct kvm_memory_slot *slot;
				1232
				1233	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1234	if (!memslot_valid_for_gpte(slot, no_dirty_log))
				1235	slot = NULL;
				1236
				1237	return slot;
				1238	}
				1239
				1240	static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
				1241	bool *force_pt_level)
				1242	{
				1243	int host_level, level, max_level;
				1244	struct kvm_memory_slot *slot;
				1245
				1246	if (unlikely(*force_pt_level))
				1247	return PT_PAGE_TABLE_LEVEL;
				1248
				1249	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
				1250	*force_pt_level = !memslot_valid_for_gpte(slot, true);
				1251	if (unlikely(*force_pt_level))
				1252	return PT_PAGE_TABLE_LEVEL;
				1253
				1254	host_level = host_mapping_level(vcpu, large_gfn);
				1255
				1256	if (host_level == PT_PAGE_TABLE_LEVEL)
				1257	return host_level;
				1258
				1259	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
				1260
				1261	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
				1262	if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
				1263	break;
				1264
				1265	return level - 1;
				1266	}
				1267
				1268	/*
				1269	* About rmap_head encoding:
				1270	*
				1271	* If the bit zero of rmap_head->val is clear, then it points to the only spte
				1272	* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
				1273	* pte_list_desc containing more mappings.
				1274	*/
				1275
				1276	/*
				1277	* Returns the number of pointers in the rmap chain, not counting the new one.
				1278	*/
				1279	static int pte_list_add(struct kvm_vcpu vcpu, u64 spte,
				1280	struct kvm_rmap_head *rmap_head)
				1281	{
				1282	struct pte_list_desc *desc;
				1283	int i, count = 0;
				1284
				1285	if (!rmap_head->val) {
				1286	rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
				1287	rmap_head->val = (unsigned long)spte;
				1288	} else if (!(rmap_head->val & 1)) {
				1289	rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
				1290	desc = mmu_alloc_pte_list_desc(vcpu);
				1291	desc->sptes[0] = (u64 *)rmap_head->val;
				1292	desc->sptes[1] = spte;
				1293	rmap_head->val = (unsigned long)desc \| 1;
				1294	++count;
				1295	} else {
				1296	rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
				1297	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1298	while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
				1299	desc = desc->more;
				1300	count += PTE_LIST_EXT;
				1301	}
				1302	if (desc->sptes[PTE_LIST_EXT-1]) {
				1303	desc->more = mmu_alloc_pte_list_desc(vcpu);
				1304	desc = desc->more;
				1305	}
				1306	for (i = 0; desc->sptes[i]; ++i)
				1307	++count;
				1308	desc->sptes[i] = spte;
				1309	}
				1310	return count;
				1311	}
				1312
				1313	static void
				1314	pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
				1315	struct pte_list_desc *desc, int i,
				1316	struct pte_list_desc *prev_desc)
				1317	{
				1318	int j;
				1319
				1320	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
				1321	;
				1322	desc->sptes[i] = desc->sptes[j];
				1323	desc->sptes[j] = NULL;
				1324	if (j != 0)
				1325	return;
				1326	if (!prev_desc && !desc->more)
				1327	rmap_head->val = (unsigned long)desc->sptes[0];
				1328	else
				1329	if (prev_desc)
				1330	prev_desc->more = desc->more;
				1331	else
				1332	rmap_head->val = (unsigned long)desc->more \| 1;
				1333	mmu_free_pte_list_desc(desc);
				1334	}
				1335
				1336	static void pte_list_remove(u64 spte, struct kvm_rmap_head rmap_head)
				1337	{
				1338	struct pte_list_desc *desc;
				1339	struct pte_list_desc *prev_desc;
				1340	int i;
				1341
				1342	if (!rmap_head->val) {
				1343	printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
				1344	BUG();
				1345	} else if (!(rmap_head->val & 1)) {
				1346	rmap_printk("pte_list_remove: %p 1->0\n", spte);
				1347	if ((u64 *)rmap_head->val != spte) {
				1348	printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
				1349	BUG();
				1350	}
				1351	rmap_head->val = 0;
				1352	} else {
				1353	rmap_printk("pte_list_remove: %p many->many\n", spte);
				1354	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1355	prev_desc = NULL;
				1356	while (desc) {
				1357	for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				1358	if (desc->sptes[i] == spte) {
				1359	pte_list_desc_remove_entry(rmap_head,
				1360	desc, i, prev_desc);
				1361	return;
				1362	}
				1363	}
				1364	prev_desc = desc;
				1365	desc = desc->more;
				1366	}
				1367	pr_err("pte_list_remove: %p many->many\n", spte);
				1368	BUG();
				1369	}
				1370	}
				1371
				1372	static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
				1373	struct kvm_memory_slot *slot)
				1374	{
				1375	unsigned long idx;
				1376
				1377	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1378	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
				1379	}
				1380
				1381	static struct kvm_rmap_head gfn_to_rmap(struct kvm kvm, gfn_t gfn,
				1382	struct kvm_mmu_page *sp)
				1383	{
				1384	struct kvm_memslots *slots;
				1385	struct kvm_memory_slot *slot;
				1386
				1387	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1388	slot = __gfn_to_memslot(slots, gfn);
				1389	return __gfn_to_rmap(gfn, sp->role.level, slot);
				1390	}
				1391
				1392	static bool rmap_can_add(struct kvm_vcpu *vcpu)
				1393	{
				1394	struct kvm_mmu_memory_cache *cache;
				1395
				1396	cache = &vcpu->arch.mmu_pte_list_desc_cache;
				1397	return mmu_memory_cache_free_objects(cache);
				1398	}
				1399
				1400	static int rmap_add(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1401	{
				1402	struct kvm_mmu_page *sp;
				1403	struct kvm_rmap_head *rmap_head;
				1404
				1405	sp = page_header(__pa(spte));
				1406	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
				1407	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1408	return pte_list_add(vcpu, spte, rmap_head);
				1409	}
				1410
				1411	static void rmap_remove(struct kvm kvm, u64 spte)
				1412	{
				1413	struct kvm_mmu_page *sp;
				1414	gfn_t gfn;
				1415	struct kvm_rmap_head *rmap_head;
				1416
				1417	sp = page_header(__pa(spte));
				1418	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
				1419	rmap_head = gfn_to_rmap(kvm, gfn, sp);
				1420	pte_list_remove(spte, rmap_head);
				1421	}
				1422
				1423	/*
				1424	* Used by the following functions to iterate through the sptes linked by a
				1425	* rmap. All fields are private and not assumed to be used outside.
				1426	*/
				1427	struct rmap_iterator {
				1428	/* private fields */
				1429	struct pte_list_desc desc; / holds the sptep if not NULL */
				1430	int pos; /* index of the sptep */
				1431	};
				1432
				1433	/*
				1434	* Iteration must be started by this function. This should also be used after
				1435	* removing/dropping sptes from the rmap link because in such cases the
				1436	* information in the itererator may not be valid.
				1437	*
				1438	* Returns sptep if found, NULL otherwise.
				1439	*/
				1440	static u64 rmap_get_first(struct kvm_rmap_head rmap_head,
				1441	struct rmap_iterator *iter)
				1442	{
				1443	u64 *sptep;
				1444
				1445	if (!rmap_head->val)
				1446	return NULL;
				1447
				1448	if (!(rmap_head->val & 1)) {
				1449	iter->desc = NULL;
				1450	sptep = (u64 *)rmap_head->val;
				1451	goto out;
				1452	}
				1453
				1454	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1455	iter->pos = 0;
				1456	sptep = iter->desc->sptes[iter->pos];
				1457	out:
				1458	BUG_ON(!is_shadow_present_pte(*sptep));
				1459	return sptep;
				1460	}
				1461
				1462	/*
				1463	* Must be used with a valid iterator: e.g. after rmap_get_first().
				1464	*
				1465	* Returns sptep if found, NULL otherwise.
				1466	*/
				1467	static u64 rmap_get_next(struct rmap_iterator iter)
				1468	{
				1469	u64 *sptep;
				1470
				1471	if (iter->desc) {
				1472	if (iter->pos < PTE_LIST_EXT - 1) {
				1473	++iter->pos;
				1474	sptep = iter->desc->sptes[iter->pos];
				1475	if (sptep)
				1476	goto out;
				1477	}
				1478
				1479	iter->desc = iter->desc->more;
				1480
				1481	if (iter->desc) {
				1482	iter->pos = 0;
				1483	/* desc->sptes[0] cannot be NULL */
				1484	sptep = iter->desc->sptes[iter->pos];
				1485	goto out;
				1486	}
				1487	}
				1488
				1489	return NULL;
				1490	out:
				1491	BUG_ON(!is_shadow_present_pte(*sptep));
				1492	return sptep;
				1493	}
				1494
				1495	#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
				1496	for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
				1497	_spte_; _spte_ = rmap_get_next(_iter_))
				1498
				1499	static void drop_spte(struct kvm kvm, u64 sptep)
				1500	{
				1501	if (mmu_spte_clear_track_bits(sptep))
				1502	rmap_remove(kvm, sptep);
				1503	}
				1504
				1505
				1506	static bool __drop_large_spte(struct kvm kvm, u64 sptep)
				1507	{
				1508	if (is_large_pte(*sptep)) {
				1509	WARN_ON(page_header(__pa(sptep))->role.level ==
				1510	PT_PAGE_TABLE_LEVEL);
				1511	drop_spte(kvm, sptep);
				1512	--kvm->stat.lpages;
				1513	return true;
				1514	}
				1515
				1516	return false;
				1517	}
				1518
				1519	static void drop_large_spte(struct kvm_vcpu vcpu, u64 sptep)
				1520	{
				1521	if (__drop_large_spte(vcpu->kvm, sptep))
				1522	kvm_flush_remote_tlbs(vcpu->kvm);
				1523	}
				1524
				1525	/*
				1526	* Write-protect on the specified @sptep, @pt_protect indicates whether
				1527	* spte write-protection is caused by protecting shadow page table.
				1528	*
				1529	* Note: write protection is difference between dirty logging and spte
				1530	* protection:
				1531	* - for dirty logging, the spte can be set to writable at anytime if
				1532	* its dirty bitmap is properly set.
				1533	* - for spte protection, the spte can be writable only after unsync-ing
				1534	* shadow page.
				1535	*
				1536	* Return true if tlb need be flushed.
				1537	*/
				1538	static bool spte_write_protect(u64 *sptep, bool pt_protect)
				1539	{
				1540	u64 spte = *sptep;
				1541
				1542	if (!is_writable_pte(spte) &&
				1543	!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
				1544	return false;
				1545
				1546	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
				1547
				1548	if (pt_protect)
				1549	spte &= ~SPTE_MMU_WRITEABLE;
				1550	spte = spte & ~PT_WRITABLE_MASK;
				1551
				1552	return mmu_spte_update(sptep, spte);
				1553	}
				1554
				1555	static bool __rmap_write_protect(struct kvm *kvm,
				1556	struct kvm_rmap_head *rmap_head,
				1557	bool pt_protect)
				1558	{
				1559	u64 *sptep;
				1560	struct rmap_iterator iter;
				1561	bool flush = false;
				1562
				1563	for_each_rmap_spte(rmap_head, &iter, sptep)
				1564	flush \|= spte_write_protect(sptep, pt_protect);
				1565
				1566	return flush;
				1567	}
				1568
				1569	static bool spte_clear_dirty(u64 *sptep)
				1570	{
				1571	u64 spte = *sptep;
				1572
				1573	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
				1574
				1575	spte &= ~shadow_dirty_mask;
				1576
				1577	return mmu_spte_update(sptep, spte);
				1578	}
				1579
				1580	static bool wrprot_ad_disabled_spte(u64 *sptep)
				1581	{
				1582	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
				1583	(unsigned long *)sptep);
				1584	if (was_writable)
				1585	kvm_set_pfn_dirty(spte_to_pfn(*sptep));
				1586
				1587	return was_writable;
				1588	}
				1589
				1590	/*
				1591	* Gets the GFN ready for another round of dirty logging by clearing the
				1592	* - D bit on ad-enabled SPTEs, and
				1593	* - W bit on ad-disabled SPTEs.
				1594	* Returns true iff any D or W bits were cleared.
				1595	*/
				1596	static bool __rmap_clear_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1597	{
				1598	u64 *sptep;
				1599	struct rmap_iterator iter;
				1600	bool flush = false;
				1601
				1602	for_each_rmap_spte(rmap_head, &iter, sptep)
				1603	if (spte_ad_enabled(*sptep))
				1604	flush \|= spte_clear_dirty(sptep);
				1605	else
				1606	flush \|= wrprot_ad_disabled_spte(sptep);
				1607
				1608	return flush;
				1609	}
				1610
				1611	static bool spte_set_dirty(u64 *sptep)
				1612	{
				1613	u64 spte = *sptep;
				1614
				1615	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
				1616
				1617	spte \|= shadow_dirty_mask;
				1618
				1619	return mmu_spte_update(sptep, spte);
				1620	}
				1621
				1622	static bool __rmap_set_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1623	{
				1624	u64 *sptep;
				1625	struct rmap_iterator iter;
				1626	bool flush = false;
				1627
				1628	for_each_rmap_spte(rmap_head, &iter, sptep)
				1629	if (spte_ad_enabled(*sptep))
				1630	flush \|= spte_set_dirty(sptep);
				1631
				1632	return flush;
				1633	}
				1634
				1635	/**
				1636	* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
				1637	* @kvm: kvm instance
				1638	* @slot: slot to protect
				1639	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1640	* @mask: indicates which pages we should protect
				1641	*
				1642	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1643	* logging we do not have any such mappings.
				1644	*/
				1645	static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				1646	struct kvm_memory_slot *slot,
				1647	gfn_t gfn_offset, unsigned long mask)
				1648	{
				1649	struct kvm_rmap_head *rmap_head;
				1650
				1651	while (mask) {
				1652	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1653	PT_PAGE_TABLE_LEVEL, slot);
				1654	__rmap_write_protect(kvm, rmap_head, false);
				1655
				1656	/* clear the first set bit */
				1657	mask &= mask - 1;
				1658	}
				1659	}
				1660
				1661	/**
				1662	* kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
				1663	* protect the page if the D-bit isn't supported.
				1664	* @kvm: kvm instance
				1665	* @slot: slot to clear D-bit
				1666	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1667	* @mask: indicates which pages we should clear D-bit
				1668	*
				1669	* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
				1670	*/
				1671	void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				1672	struct kvm_memory_slot *slot,
				1673	gfn_t gfn_offset, unsigned long mask)
				1674	{
				1675	struct kvm_rmap_head *rmap_head;
				1676
				1677	while (mask) {
				1678	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1679	PT_PAGE_TABLE_LEVEL, slot);
				1680	__rmap_clear_dirty(kvm, rmap_head);
				1681
				1682	/* clear the first set bit */
				1683	mask &= mask - 1;
				1684	}
				1685	}
				1686	EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
				1687
				1688	/**
				1689	* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
				1690	* PT level pages.
				1691	*
				1692	* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
				1693	* enable dirty logging for them.
				1694	*
				1695	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1696	* logging we do not have any such mappings.
				1697	*/
				1698	void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				1699	struct kvm_memory_slot *slot,
				1700	gfn_t gfn_offset, unsigned long mask)
				1701	{
				1702	if (kvm_x86_ops->enable_log_dirty_pt_masked)
				1703	kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				1704	mask);
				1705	else
				1706	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
				1707	}
				1708
				1709	/**
				1710	* kvm_arch_write_log_dirty - emulate dirty page logging
				1711	* @vcpu: Guest mode vcpu
				1712	*
				1713	* Emulate arch specific page modification logging for the
				1714	* nested hypervisor
				1715	*/
				1716	int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
				1717	{
				1718	if (kvm_x86_ops->write_log_dirty)
				1719	return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
				1720
				1721	return 0;
				1722	}
				1723
				1724	bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				1725	struct kvm_memory_slot *slot, u64 gfn)
				1726	{
				1727	struct kvm_rmap_head *rmap_head;
				1728	int i;
				1729	bool write_protected = false;
				1730
				1731	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1732	rmap_head = __gfn_to_rmap(gfn, i, slot);
				1733	write_protected \|= __rmap_write_protect(kvm, rmap_head, true);
				1734	}
				1735
				1736	return write_protected;
				1737	}
				1738
				1739	static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
				1740	{
				1741	struct kvm_memory_slot *slot;
				1742
				1743	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1744	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
				1745	}
				1746
				1747	static bool kvm_zap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1748	{
				1749	u64 *sptep;
				1750	struct rmap_iterator iter;
				1751	bool flush = false;
				1752
				1753	while ((sptep = rmap_get_first(rmap_head, &iter))) {
				1754	rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
				1755
				1756	drop_spte(kvm, sptep);
				1757	flush = true;
				1758	}
				1759
				1760	return flush;
				1761	}
				1762
				1763	static int kvm_unmap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1764	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1765	unsigned long data)
				1766	{
				1767	return kvm_zap_rmapp(kvm, rmap_head);
				1768	}
				1769
				1770	static int kvm_set_pte_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1771	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1772	unsigned long data)
				1773	{
				1774	u64 *sptep;
				1775	struct rmap_iterator iter;
				1776	int need_flush = 0;
				1777	u64 new_spte;
				1778	pte_t ptep = (pte_t )data;
				1779	kvm_pfn_t new_pfn;
				1780
				1781	WARN_ON(pte_huge(*ptep));
				1782	new_pfn = pte_pfn(*ptep);
				1783
				1784	restart:
				1785	for_each_rmap_spte(rmap_head, &iter, sptep) {
				1786	rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
				1787	sptep, *sptep, gfn, level);
				1788
				1789	need_flush = 1;
				1790
				1791	if (pte_write(*ptep)) {
				1792	drop_spte(kvm, sptep);
				1793	goto restart;
				1794	} else {
				1795	new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
				1796	new_spte \|= (u64)new_pfn << PAGE_SHIFT;
				1797
				1798	new_spte &= ~PT_WRITABLE_MASK;
				1799	new_spte &= ~SPTE_HOST_WRITEABLE;
				1800
				1801	new_spte = mark_spte_for_access_track(new_spte);
				1802
				1803	mmu_spte_clear_track_bits(sptep);
				1804	mmu_spte_set(sptep, new_spte);
				1805	}
				1806	}
				1807
				1808	if (need_flush)
				1809	kvm_flush_remote_tlbs(kvm);
				1810
				1811	return 0;
				1812	}
				1813
				1814	struct slot_rmap_walk_iterator {
				1815	/* input fields. */
				1816	struct kvm_memory_slot *slot;
				1817	gfn_t start_gfn;
				1818	gfn_t end_gfn;
				1819	int start_level;
				1820	int end_level;
				1821
				1822	/* output fields. */
				1823	gfn_t gfn;
				1824	struct kvm_rmap_head *rmap;
				1825	int level;
				1826
				1827	/* private field. */
				1828	struct kvm_rmap_head *end_rmap;
				1829	};
				1830
				1831	static void
				1832	rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
				1833	{
				1834	iterator->level = level;
				1835	iterator->gfn = iterator->start_gfn;
				1836	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
				1837	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
				1838	iterator->slot);
				1839	}
				1840
				1841	static void
				1842	slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
				1843	struct kvm_memory_slot *slot, int start_level,
				1844	int end_level, gfn_t start_gfn, gfn_t end_gfn)
				1845	{
				1846	iterator->slot = slot;
				1847	iterator->start_level = start_level;
				1848	iterator->end_level = end_level;
				1849	iterator->start_gfn = start_gfn;
				1850	iterator->end_gfn = end_gfn;
				1851
				1852	rmap_walk_init_level(iterator, iterator->start_level);
				1853	}
				1854
				1855	static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
				1856	{
				1857	return !!iterator->rmap;
				1858	}
				1859
				1860	static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
				1861	{
				1862	if (++iterator->rmap <= iterator->end_rmap) {
				1863	iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
				1864	return;
				1865	}
				1866
				1867	if (++iterator->level > iterator->end_level) {
				1868	iterator->rmap = NULL;
				1869	return;
				1870	}
				1871
				1872	rmap_walk_init_level(iterator, iterator->level);
				1873	}
				1874
				1875	#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
				1876	_start_gfn, _end_gfn, _iter_) \
				1877	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
				1878	_end_level_, _start_gfn, _end_gfn); \
				1879	slot_rmap_walk_okay(_iter_); \
				1880	slot_rmap_walk_next(_iter_))
				1881
				1882	static int kvm_handle_hva_range(struct kvm *kvm,
				1883	unsigned long start,
				1884	unsigned long end,
				1885	unsigned long data,
				1886	int (handler)(struct kvm kvm,
				1887	struct kvm_rmap_head *rmap_head,
				1888	struct kvm_memory_slot *slot,
				1889	gfn_t gfn,
				1890	int level,
				1891	unsigned long data))
				1892	{
				1893	struct kvm_memslots *slots;
				1894	struct kvm_memory_slot *memslot;
				1895	struct slot_rmap_walk_iterator iterator;
				1896	int ret = 0;
				1897	int i;
				1898
				1899	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				1900	slots = __kvm_memslots(kvm, i);
				1901	kvm_for_each_memslot(memslot, slots) {
				1902	unsigned long hva_start, hva_end;
				1903	gfn_t gfn_start, gfn_end;
				1904
				1905	hva_start = max(start, memslot->userspace_addr);
				1906	hva_end = min(end, memslot->userspace_addr +
				1907	(memslot->npages << PAGE_SHIFT));
				1908	if (hva_start >= hva_end)
				1909	continue;
				1910	/*
				1911	* {gfn(page) \| page intersects with [hva_start, hva_end)} =
				1912	* {gfn_start, gfn_start+1, ..., gfn_end-1}.
				1913	*/
				1914	gfn_start = hva_to_gfn_memslot(hva_start, memslot);
				1915	gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
				1916
				1917	for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
				1918	PT_MAX_HUGEPAGE_LEVEL,
				1919	gfn_start, gfn_end - 1,
				1920	&iterator)
				1921	ret \|= handler(kvm, iterator.rmap, memslot,
				1922	iterator.gfn, iterator.level, data);
				1923	}
				1924	}
				1925
				1926	return ret;
				1927	}
				1928
				1929	static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
				1930	unsigned long data,
				1931	int (handler)(struct kvm kvm,
				1932	struct kvm_rmap_head *rmap_head,
				1933	struct kvm_memory_slot *slot,
				1934	gfn_t gfn, int level,
				1935	unsigned long data))
				1936	{
				1937	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
				1938	}
				1939
				1940	int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
				1941	{
				1942	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
				1943	}
				1944
				1945	int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
				1946	{
				1947	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
				1948	}
				1949
				1950	void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
				1951	{
				1952	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
				1953	}
				1954
				1955	static int kvm_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1956	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1957	unsigned long data)
				1958	{
				1959	u64 *sptep;
				1960	struct rmap_iterator uninitialized_var(iter);
				1961	int young = 0;
				1962
				1963	for_each_rmap_spte(rmap_head, &iter, sptep)
				1964	young \|= mmu_spte_age(sptep);
				1965
				1966	trace_kvm_age_page(gfn, level, slot, young);
				1967	return young;
				1968	}
				1969
				1970	static int kvm_test_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1971	struct kvm_memory_slot *slot, gfn_t gfn,
				1972	int level, unsigned long data)
				1973	{
				1974	u64 *sptep;
				1975	struct rmap_iterator iter;
				1976
				1977	for_each_rmap_spte(rmap_head, &iter, sptep)
				1978	if (is_accessed_spte(*sptep))
				1979	return 1;
				1980	return 0;
				1981	}
				1982
				1983	#define RMAP_RECYCLE_THRESHOLD 1000
				1984
				1985	static void rmap_recycle(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1986	{
				1987	struct kvm_rmap_head *rmap_head;
				1988	struct kvm_mmu_page *sp;
				1989
				1990	sp = page_header(__pa(spte));
				1991
				1992	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1993
				1994	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
				1995	kvm_flush_remote_tlbs(vcpu->kvm);
				1996	}
				1997
				1998	int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
				1999	{
				2000	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
				2001	}
				2002
				2003	int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
				2004	{
				2005	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
				2006	}
				2007
				#ifdef MMU_DEBUG
				/*
				 * Debug helper: scan one page worth of sptes starting at @spt.  Returns
				 * 1 if none is shadow-present; otherwise logs the first offender and
				 * returns 0.
				 */
				static int is_empty_shadow_page(u64 *spt)
				{
					u64 *const end = spt + PAGE_SIZE / sizeof(u64);
					u64 *pos;

					for (pos = spt; pos != end; pos++) {
						if (!is_shadow_present_pte(*pos))
							continue;

						printk(KERN_ERR "%s: %p %llx\n", __func__,
						       pos, *pos);
						return 0;
					}
					return 1;
				}
				#endif
				2023
				2024	/*
				2025	* This value is the sum of all of the kvm instances's
				2026	* kvm->arch.n_used_mmu_pages values. We need a global,
				2027	* aggregate version in order to make the slab shrinker
				2028	* faster
				2029	*/
				2030	static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
				2031	{
				2032	kvm->arch.n_used_mmu_pages += nr;
				2033	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
				2034	}
				2035
				2036	static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
				2037	{
				2038	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
				2039	hlist_del(&sp->hash_link);
				2040	list_del(&sp->link);
				2041	free_page((unsigned long)sp->spt);
				2042	if (!sp->role.direct)
				2043	free_page((unsigned long)sp->gfns);
				2044	kmem_cache_free(mmu_page_header_cache, sp);
				2045	}
				2046
				2047	static unsigned kvm_page_table_hashfn(gfn_t gfn)
				2048	{
				2049	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
				2050	}
				2051
				2052	static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				2053	struct kvm_mmu_page sp, u64 parent_pte)
				2054	{
				2055	if (!parent_pte)
				2056	return;
				2057
				2058	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
				2059	}
				2060
				2061	static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				2062	u64 *parent_pte)
				2063	{
				2064	pte_list_remove(parent_pte, &sp->parent_ptes);
				2065	}
				2066
				2067	static void drop_parent_pte(struct kvm_mmu_page *sp,
				2068	u64 *parent_pte)
				2069	{
				2070	mmu_page_remove_parent_pte(sp, parent_pte);
				2071	mmu_spte_clear_no_track(parent_pte);
				2072	}
				2073
				2074	static struct kvm_mmu_page kvm_mmu_alloc_page(struct kvm_vcpu vcpu, int direct)
				2075	{
				2076	struct kvm_mmu_page *sp;
				2077
				2078	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
				2079	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2080	if (!direct)
				2081	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2082	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
				2083
				2084	/*
				2085	* The active_mmu_pages list is the FIFO list, do not move the
				2086	* page until it is zapped. kvm_zap_obsolete_pages depends on
				2087	* this feature. See the comments in kvm_zap_obsolete_pages().
				2088	*/
				2089	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
				2090	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
				2091	return sp;
				2092	}
				2093
				2094	static void mark_unsync(u64 *spte);
				2095	static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
				2096	{
				2097	u64 *sptep;
				2098	struct rmap_iterator iter;
				2099
				2100	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
				2101	mark_unsync(sptep);
				2102	}
				2103	}
				2104
				2105	static void mark_unsync(u64 *spte)
				2106	{
				2107	struct kvm_mmu_page *sp;
				2108	unsigned int index;
				2109
				2110	sp = page_header(__pa(spte));
				2111	index = spte - sp->spt;
				2112	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
				2113	return;
				2114	if (sp->unsync_children++)
				2115	return;
				2116	kvm_mmu_mark_parents_unsync(sp);
				2117	}
				2118
				/* sync_page stub: does nothing and always returns 0. */
				static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
							       struct kvm_mmu_page *sp)
				{
					const int nr_synced = 0;

					return nr_synced;
				}
				2124
				2125	static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
				2126	{
				2127	}
				2128
				2129	static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				2130	struct kvm_mmu_page sp, u64 spte,
				2131	const void *pte)
				2132	{
				2133	WARN_ON(1);
				2134	}
				2135
				/* Capacity of one kvm_mmu_pages batch. */
				#define KVM_PAGE_ARRAY_NR 16

				/*
				 * Fixed-size batch of shadow pages gathered by the unsync walk.  Each
				 * entry pairs a page with the index it occupies in its parent table.
				 */
				struct kvm_mmu_pages {
					struct mmu_page_and_offset {
						struct kvm_mmu_page *sp;	/* collected page */
						unsigned int idx;		/* slot in the parent's spt */
					} page[KVM_PAGE_ARRAY_NR];
					unsigned int nr;			/* entries in use */
				};
				2145
				2146	static int mmu_pages_add(struct kvm_mmu_pages pvec, struct kvm_mmu_page sp,
				2147	int idx)
				2148	{
				2149	int i;
				2150
				2151	if (sp->unsync)
				2152	for (i=0; i < pvec->nr; i++)
				2153	if (pvec->page[i].sp == sp)
				2154	return 0;
				2155
				2156	pvec->page[pvec->nr].sp = sp;
				2157	pvec->page[pvec->nr].idx = idx;
				2158	pvec->nr++;
				2159	return (pvec->nr == KVM_PAGE_ARRAY_NR);
				2160	}
				2161
				2162	static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
				2163	{
				2164	--sp->unsync_children;
				2165	WARN_ON((int)sp->unsync_children < 0);
				2166	__clear_bit(idx, sp->unsync_child_bitmap);
				2167	}
				2168
				2169	static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
				2170	struct kvm_mmu_pages *pvec)
				2171	{
				2172	int i, ret, nr_unsync_leaf = 0;
				2173
				2174	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
				2175	struct kvm_mmu_page *child;
				2176	u64 ent = sp->spt[i];
				2177
				2178	if (!is_shadow_present_pte(ent) \|\| is_large_pte(ent)) {
				2179	clear_unsync_child_bit(sp, i);
				2180	continue;
				2181	}
				2182
				2183	child = page_header(ent & PT64_BASE_ADDR_MASK);
				2184
				2185	if (child->unsync_children) {
				2186	if (mmu_pages_add(pvec, child, i))
				2187	return -ENOSPC;
				2188
				2189	ret = __mmu_unsync_walk(child, pvec);
				2190	if (!ret) {
				2191	clear_unsync_child_bit(sp, i);
				2192	continue;
				2193	} else if (ret > 0) {
				2194	nr_unsync_leaf += ret;
				2195	} else
				2196	return ret;
				2197	} else if (child->unsync) {
				2198	nr_unsync_leaf++;
				2199	if (mmu_pages_add(pvec, child, i))
				2200	return -ENOSPC;
				2201	} else
				2202	clear_unsync_child_bit(sp, i);
				2203	}
				2204
				2205	return nr_unsync_leaf;
				2206	}
				2207
				2208	#define INVALID_INDEX (-1)
				2209
				2210	static int mmu_unsync_walk(struct kvm_mmu_page *sp,
				2211	struct kvm_mmu_pages *pvec)
				2212	{
				2213	pvec->nr = 0;
				2214	if (!sp->unsync_children)
				2215	return 0;
				2216
				2217	mmu_pages_add(pvec, sp, INVALID_INDEX);
				2218	return __mmu_unsync_walk(sp, pvec);
				2219	}
				2220
				2221	static void kvm_unlink_unsync_page(struct kvm kvm, struct kvm_mmu_page sp)
				2222	{
				2223	WARN_ON(!sp->unsync);
				2224	trace_kvm_mmu_sync_page(sp);
				2225	sp->unsync = 0;
				2226	--kvm->stat.mmu_unsync;
				2227	}
				2228
				/* Forward declarations: the zap helpers are used before they are defined. */
				static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
								    struct list_head *invalid_list);
				static void kvm_mmu_commit_zap_page(struct kvm *kvm,
								    struct list_head *invalid_list);
				2233
				/*
				 * NOTE: we should pay more attention on the zapped-obsolete page
				 * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
				 * since it has been deleted from active_mmu_pages but still can be found
				 * at hash list.
				 *
				 * for_each_valid_sp() has skipped that kind of pages.
				 *
				 * Both macros are loop headers: the statement following them runs once
				 * per matching shadow page in the hash bucket of @_gfn.  The trailing
				 * "if (...) {} else" form filters entries while keeping "continue" and
				 * "break" in the body bound to the underlying list loop.
				 */
				#define for_each_valid_sp(_kvm, _sp, _gfn)				\
					hlist_for_each_entry(_sp,					\
					  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
						if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
						} else

				/* As above, restricted to indirect pages whose gfn equals @_gfn. */
				#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)		\
					for_each_valid_sp(_kvm, _sp, _gfn)				\
						if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
				2251
				2252	/* @sp->gfn should be write-protected at the call site */
				2253	static bool __kvm_sync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				2254	struct list_head *invalid_list)
				2255	{
				2256	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
				2257	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
				2258	return false;
				2259	}
				2260
				2261	if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
				2262	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
				2263	return false;
				2264	}
				2265
				2266	return true;
				2267	}
				2268
				2269	static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				2270	struct list_head *invalid_list,
				2271	bool remote_flush, bool local_flush)
				2272	{
				2273	if (!list_empty(invalid_list)) {
				2274	kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
				2275	return;
				2276	}
				2277
				2278	if (remote_flush)
				2279	kvm_flush_remote_tlbs(vcpu->kvm);
				2280	else if (local_flush)
				2281	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2282	}
				2283
				/*
				 * With CONFIG_KVM_MMU_AUDIT the audit code is textually included here;
				 * without it the two entry points collapse to empty stubs.
				 */
				#ifdef CONFIG_KVM_MMU_AUDIT
				#include "mmu_audit.c"
				#else
				static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
				{
				}

				static void mmu_audit_disable(void)
				{
				}
				#endif
				2290
				2291	static bool is_obsolete_sp(struct kvm kvm, struct kvm_mmu_page sp)
				2292	{
				2293	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
				2294	}
				2295
				2296	static bool kvm_sync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				2297	struct list_head *invalid_list)
				2298	{
				2299	kvm_unlink_unsync_page(vcpu->kvm, sp);
				2300	return __kvm_sync_page(vcpu, sp, invalid_list);
				2301	}
				2302
				2303	/* @gfn should be write-protected at the call site */
				2304	static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
				2305	struct list_head *invalid_list)
				2306	{
				2307	struct kvm_mmu_page *s;
				2308	bool ret = false;
				2309
				2310	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
				2311	if (!s->unsync)
				2312	continue;
				2313
				2314	WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
				2315	ret \|= kvm_sync_page(vcpu, s, invalid_list);
				2316	}
				2317
				2318	return ret;
				2319	}
				2320
				2321	struct mmu_page_path {
				2322	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
				2323	unsigned int idx[PT64_ROOT_MAX_LEVEL];
				2324	};
				2325
				/*
				 * Visit the leaf entries of the kvm_mmu_pages variable @pvec in order,
				 * setting @sp to each page, keeping the mmu_page_path variable @parents
				 * up to date and using @i as the cursor.  @pvec and @parents are
				 * objects, not pointers.
				 */
				#define for_each_sp(pvec, sp, parents, i)				\
					for (i = mmu_pages_first(&(pvec), &(parents));		\
					     i < (pvec).nr && ({ sp = (pvec).page[i].sp; 1;});	\
					     i = mmu_pages_next(&(pvec), &(parents), i))
				2330
				2331	static int mmu_pages_next(struct kvm_mmu_pages *pvec,
				2332	struct mmu_page_path *parents,
				2333	int i)
				2334	{
				2335	int n;
				2336
				2337	for (n = i+1; n < pvec->nr; n++) {
				2338	struct kvm_mmu_page *sp = pvec->page[n].sp;
				2339	unsigned idx = pvec->page[n].idx;
				2340	int level = sp->role.level;
				2341
				2342	parents->idx[level-1] = idx;
				2343	if (level == PT_PAGE_TABLE_LEVEL)
				2344	break;
				2345
				2346	parents->parent[level-2] = sp;
				2347	}
				2348
				2349	return n;
				2350	}
				2351
				2352	static int mmu_pages_first(struct kvm_mmu_pages *pvec,
				2353	struct mmu_page_path *parents)
				2354	{
				2355	struct kvm_mmu_page *sp;
				2356	int level;
				2357
				2358	if (pvec->nr == 0)
				2359	return 0;
				2360
				2361	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
				2362
				2363	sp = pvec->page[0].sp;
				2364	level = sp->role.level;
				2365	WARN_ON(level == PT_PAGE_TABLE_LEVEL);
				2366
				2367	parents->parent[level-2] = sp;
				2368
				2369	/* Also set up a sentinel. Further entries in pvec are all
				2370	* children of sp, so this element is never overwritten.
				2371	*/
				2372	parents->parent[level-1] = NULL;
				2373	return mmu_pages_next(pvec, parents, 0);
				2374	}
				2375
				2376	static void mmu_pages_clear_parents(struct mmu_page_path *parents)
				2377	{
				2378	struct kvm_mmu_page *sp;
				2379	unsigned int level = 0;
				2380
				2381	do {
				2382	unsigned int idx = parents->idx[level];
				2383	sp = parents->parent[level];
				2384	if (!sp)
				2385	return;
				2386
				2387	WARN_ON(idx == INVALID_INDEX);
				2388	clear_unsync_child_bit(sp, idx);
				2389	level++;
				2390	} while (!sp->unsync_children);
				2391	}
				2392
				2393	static void mmu_sync_children(struct kvm_vcpu *vcpu,
				2394	struct kvm_mmu_page *parent)
				2395	{
				2396	int i;
				2397	struct kvm_mmu_page *sp;
				2398	struct mmu_page_path parents;
				2399	struct kvm_mmu_pages pages;
				2400	LIST_HEAD(invalid_list);
				2401	bool flush = false;
				2402
				2403	while (mmu_unsync_walk(parent, &pages)) {
				2404	bool protected = false;
				2405
				2406	for_each_sp(pages, sp, parents, i)
				2407	protected \|= rmap_write_protect(vcpu, sp->gfn);
				2408
				2409	if (protected) {
				2410	kvm_flush_remote_tlbs(vcpu->kvm);
				2411	flush = false;
				2412	}
				2413
				2414	for_each_sp(pages, sp, parents, i) {
				2415	flush \|= kvm_sync_page(vcpu, sp, &invalid_list);
				2416	mmu_pages_clear_parents(&parents);
				2417	}
				2418	if (need_resched() \|\| spin_needbreak(&vcpu->kvm->mmu_lock)) {
				2419	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2420	cond_resched_lock(&vcpu->kvm->mmu_lock);
				2421	flush = false;
				2422	}
				2423	}
				2424
				2425	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2426	}
				2427
				2428	static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
				2429	{
				2430	atomic_set(&sp->write_flooding_count, 0);
				2431	}
				2432
				2433	static void clear_sp_write_flooding_count(u64 *spte)
				2434	{
				2435	struct kvm_mmu_page *sp = page_header(__pa(spte));
				2436
				2437	__clear_sp_write_flooding_count(sp);
				2438	}
				2439
				2440	static struct kvm_mmu_page kvm_mmu_get_page(struct kvm_vcpu vcpu,
				2441	gfn_t gfn,
				2442	gva_t gaddr,
				2443	unsigned level,
				2444	int direct,
				2445	unsigned access)
				2446	{
				2447	union kvm_mmu_page_role role;
				2448	unsigned quadrant;
				2449	struct kvm_mmu_page *sp;
				2450	bool need_sync = false;
				2451	bool flush = false;
				2452	int collisions = 0;
				2453	LIST_HEAD(invalid_list);
				2454
				2455	role = vcpu->arch.mmu.base_role;
				2456	role.level = level;
				2457	role.direct = direct;
				2458	if (role.direct)
				2459	role.cr4_pae = 0;
				2460	role.access = access;
				2461	if (!vcpu->arch.mmu.direct_map
				2462	&& vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
				2463	quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
				2464	quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
				2465	role.quadrant = quadrant;
				2466	}
				2467	for_each_valid_sp(vcpu->kvm, sp, gfn) {
				2468	if (sp->gfn != gfn) {
				2469	collisions++;
				2470	continue;
				2471	}
				2472
				2473	if (!need_sync && sp->unsync)
				2474	need_sync = true;
				2475
				2476	if (sp->role.word != role.word)
				2477	continue;
				2478
				2479	if (sp->unsync) {
				2480	/* The page is good, but __kvm_sync_page might still end
				2481	* up zapping it. If so, break in order to rebuild it.
				2482	*/
				2483	if (!__kvm_sync_page(vcpu, sp, &invalid_list))
				2484	break;
				2485
				2486	WARN_ON(!list_empty(&invalid_list));
				2487	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2488	}
				2489
				2490	if (sp->unsync_children)
				2491	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				2492
				2493	__clear_sp_write_flooding_count(sp);
				2494	trace_kvm_mmu_get_page(sp, false);
				2495	goto out;
				2496	}
				2497
				2498	++vcpu->kvm->stat.mmu_cache_miss;
				2499
				2500	sp = kvm_mmu_alloc_page(vcpu, direct);
				2501
				2502	sp->gfn = gfn;
				2503	sp->role = role;
				2504	hlist_add_head(&sp->hash_link,
				2505	&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
				2506	if (!direct) {
				2507	/*
				2508	* we should do write protection before syncing pages
				2509	* otherwise the content of the synced shadow page may
				2510	* be inconsistent with guest page table.
				2511	*/
				2512	account_shadowed(vcpu->kvm, sp);
				2513	if (level == PT_PAGE_TABLE_LEVEL &&
				2514	rmap_write_protect(vcpu, gfn))
				2515	kvm_flush_remote_tlbs(vcpu->kvm);
				2516
				2517	if (level > PT_PAGE_TABLE_LEVEL && need_sync)
				2518	flush \|= kvm_sync_pages(vcpu, gfn, &invalid_list);
				2519	}
				2520	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
				2521	clear_page(sp->spt);
				2522	trace_kvm_mmu_get_page(sp, true);
				2523
				2524	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2525	out:
				2526	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
				2527	vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
				2528	return sp;
				2529	}
				2530
				2531	static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
				2532	struct kvm_vcpu *vcpu, u64 addr)
				2533	{
				2534	iterator->addr = addr;
				2535	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
				2536	iterator->level = vcpu->arch.mmu.shadow_root_level;
				2537
				2538	if (iterator->level == PT64_ROOT_4LEVEL &&
				2539	vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
				2540	!vcpu->arch.mmu.direct_map)
				2541	--iterator->level;
				2542
				2543	if (iterator->level == PT32E_ROOT_LEVEL) {
				2544	iterator->shadow_addr
				2545	= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
				2546	iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
				2547	--iterator->level;
				2548	if (!iterator->shadow_addr)
				2549	iterator->level = 0;
				2550	}
				2551	}
				2552
				2553	static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
				2554	{
				2555	if (iterator->level < PT_PAGE_TABLE_LEVEL)
				2556	return false;
				2557
				2558	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
				2559	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
				2560	return true;
				2561	}
				2562
				2563	static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
				2564	u64 spte)
				2565	{
				2566	if (is_last_spte(spte, iterator->level)) {
				2567	iterator->level = 0;
				2568	return;
				2569	}
				2570
				2571	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
				2572	--iterator->level;
				2573	}
				2574
				2575	static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
				2576	{
				2577	return __shadow_walk_next(iterator, *iterator->sptep);
				2578	}
				2579
				2580	static void link_shadow_page(struct kvm_vcpu vcpu, u64 sptep,
				2581	struct kvm_mmu_page *sp)
				2582	{
				2583	u64 spte;
				2584
				2585	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
				2586
				2587	spte = __pa(sp->spt) \| shadow_present_mask \| PT_WRITABLE_MASK \|
				2588	shadow_user_mask \| shadow_x_mask \| shadow_me_mask;
				2589
				2590	if (sp_ad_disabled(sp))
				2591	spte \|= shadow_acc_track_value;
				2592	else
				2593	spte \|= shadow_accessed_mask;
				2594
				2595	mmu_spte_set(sptep, spte);
				2596
				2597	mmu_page_add_parent_pte(vcpu, sp, sptep);
				2598
				2599	if (sp->unsync_children \|\| sp->unsync)
				2600	mark_unsync(sptep);
				2601	}
				2602
				2603	static void validate_direct_spte(struct kvm_vcpu vcpu, u64 sptep,
				2604	unsigned direct_access)
				2605	{
				2606	if (is_shadow_present_pte(sptep) && !is_large_pte(sptep)) {
				2607	struct kvm_mmu_page *child;
				2608
				2609	/*
				2610	* For the direct sp, if the guest pte's dirty bit
				2611	* changed form clean to dirty, it will corrupt the
				2612	* sp's access: allow writable in the read-only sp,
				2613	* so we should update the spte at this point to get
				2614	* a new sp with the correct access.
				2615	*/
				2616	child = page_header(*sptep & PT64_BASE_ADDR_MASK);
				2617	if (child->role.access == direct_access)
				2618	return;
				2619
				2620	drop_parent_pte(child, sptep);
				2621	kvm_flush_remote_tlbs(vcpu->kvm);
				2622	}
				2623	}
				2624
				2625	static bool mmu_page_zap_pte(struct kvm kvm, struct kvm_mmu_page sp,
				2626	u64 *spte)
				2627	{
				2628	u64 pte;
				2629	struct kvm_mmu_page *child;
				2630
				2631	pte = *spte;
				2632	if (is_shadow_present_pte(pte)) {
				2633	if (is_last_spte(pte, sp->role.level)) {
				2634	drop_spte(kvm, spte);
				2635	if (is_large_pte(pte))
				2636	--kvm->stat.lpages;
				2637	} else {
				2638	child = page_header(pte & PT64_BASE_ADDR_MASK);
				2639	drop_parent_pte(child, spte);
				2640	}
				2641	return true;
				2642	}
				2643
				2644	if (is_mmio_spte(pte))
				2645	mmu_spte_clear_no_track(spte);
				2646
				2647	return false;
				2648	}
				2649
				2650	static void kvm_mmu_page_unlink_children(struct kvm *kvm,
				2651	struct kvm_mmu_page *sp)
				2652	{
				2653	unsigned i;
				2654
				2655	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
				2656	mmu_page_zap_pte(kvm, sp, sp->spt + i);
				2657	}
				2658
				2659	static void kvm_mmu_unlink_parents(struct kvm kvm, struct kvm_mmu_page sp)
				2660	{
				2661	u64 *sptep;
				2662	struct rmap_iterator iter;
				2663
				2664	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
				2665	drop_parent_pte(sp, sptep);
				2666	}
				2667
				2668	static int mmu_zap_unsync_children(struct kvm *kvm,
				2669	struct kvm_mmu_page *parent,
				2670	struct list_head *invalid_list)
				2671	{
				2672	int i, zapped = 0;
				2673	struct mmu_page_path parents;
				2674	struct kvm_mmu_pages pages;
				2675
				2676	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
				2677	return 0;
				2678
				2679	while (mmu_unsync_walk(parent, &pages)) {
				2680	struct kvm_mmu_page *sp;
				2681
				2682	for_each_sp(pages, sp, parents, i) {
				2683	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2684	mmu_pages_clear_parents(&parents);
				2685	zapped++;
				2686	}
				2687	}
				2688
				2689	return zapped;
				2690	}
				2691
				2692	static int kvm_mmu_prepare_zap_page(struct kvm kvm, struct kvm_mmu_page sp,
				2693	struct list_head *invalid_list)
				2694	{
				2695	int ret;
				2696
				2697	trace_kvm_mmu_prepare_zap_page(sp);
				2698	++kvm->stat.mmu_shadow_zapped;
				2699	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
				2700	kvm_mmu_page_unlink_children(kvm, sp);
				2701	kvm_mmu_unlink_parents(kvm, sp);
				2702
				2703	if (!sp->role.invalid && !sp->role.direct)
				2704	unaccount_shadowed(kvm, sp);
				2705
				2706	if (sp->unsync)
				2707	kvm_unlink_unsync_page(kvm, sp);
				2708	if (!sp->root_count) {
				2709	/* Count self */
				2710	ret++;
				2711	list_move(&sp->link, invalid_list);
				2712	kvm_mod_used_mmu_pages(kvm, -1);
				2713	} else {
				2714	list_move(&sp->link, &kvm->arch.active_mmu_pages);
				2715
				2716	/*
				2717	* The obsolete pages can not be used on any vcpus.
				2718	* See the comments in kvm_mmu_invalidate_zap_all_pages().
				2719	*/
				2720	if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
				2721	kvm_reload_remote_mmus(kvm);
				2722	}
				2723
				2724	if (sp->lpage_disallowed)
				2725	unaccount_huge_nx_page(kvm, sp);
				2726
				2727	sp->role.invalid = 1;
				2728	return ret;
				2729	}
				2730
				2731	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				2732	struct list_head *invalid_list)
				2733	{
				2734	struct kvm_mmu_page sp, nsp;
				2735
				2736	if (list_empty(invalid_list))
				2737	return;
				2738
				2739	/*
				2740	* We need to make sure everyone sees our modifications to
				2741	* the page tables and see changes to vcpu->mode here. The barrier
				2742	* in the kvm_flush_remote_tlbs() achieves this. This pairs
				2743	* with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
				2744	*
				2745	* In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
				2746	* guest mode and/or lockless shadow page table walks.
				2747	*/
				2748	kvm_flush_remote_tlbs(kvm);
				2749
				2750	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
				2751	WARN_ON(!sp->role.invalid \|\| sp->root_count);
				2752	kvm_mmu_free_page(sp);
				2753	}
				2754	}
				2755
				2756	static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
				2757	struct list_head *invalid_list)
				2758	{
				2759	struct kvm_mmu_page *sp;
				2760
				2761	if (list_empty(&kvm->arch.active_mmu_pages))
				2762	return false;
				2763
				2764	sp = list_last_entry(&kvm->arch.active_mmu_pages,
				2765	struct kvm_mmu_page, link);
				2766	return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2767	}
				2768
				2769	/*
				2770	* Changing the number of mmu pages allocated to the vm
				2771	* Note: if goal_nr_mmu_pages is too small, you will get dead lock
				2772	*/
				2773	void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
				2774	{
				2775	LIST_HEAD(invalid_list);
				2776
				2777	spin_lock(&kvm->mmu_lock);
				2778
				2779	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
				2780	/* Need to free some mmu pages to achieve the goal. */
				2781	while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
				2782	if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				2783	break;
				2784
				2785	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2786	goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
				2787	}
				2788
				2789	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
				2790
				2791	spin_unlock(&kvm->mmu_lock);
				2792	}
				2793
				2794	int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
				2795	{
				2796	struct kvm_mmu_page *sp;
				2797	LIST_HEAD(invalid_list);
				2798	int r;
				2799
				2800	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
				2801	r = 0;
				2802	spin_lock(&kvm->mmu_lock);
				2803	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
				2804	pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
				2805	sp->role.word);
				2806	r = 1;
				2807	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
				2808	}
				2809	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2810	spin_unlock(&kvm->mmu_lock);
				2811
				2812	return r;
				2813	}
				2814	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
				2815
				2816	static void kvm_unsync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp)
				2817	{
				2818	trace_kvm_mmu_unsync_page(sp);
				2819	++vcpu->kvm->stat.mmu_unsync;
				2820	sp->unsync = 1;
				2821
				2822	kvm_mmu_mark_parents_unsync(sp);
				2823	}
				2824
				2825	static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				2826	bool can_unsync)
				2827	{
				2828	struct kvm_mmu_page *sp;
				2829
				2830	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				2831	return true;
				2832
				2833	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				2834	if (!can_unsync)
				2835	return true;
				2836
				2837	if (sp->unsync)
				2838	continue;
				2839
				2840	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
				2841	kvm_unsync_page(vcpu, sp);
				2842	}
				2843
				2844	return false;
				2845	}
				2846
				2847	static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
				2848	{
				2849	if (pfn_valid(pfn))
				2850	return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
				2851
				2852	return true;
				2853	}
				2854
				2855	static int set_spte(struct kvm_vcpu vcpu, u64 sptep,
				2856	unsigned pte_access, int level,
				2857	gfn_t gfn, kvm_pfn_t pfn, bool speculative,
				2858	bool can_unsync, bool host_writable)
				2859	{
				2860	u64 spte = 0;
				2861	int ret = 0;
				2862	struct kvm_mmu_page *sp;
				2863
				2864	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
				2865	return 0;
				2866
				2867	sp = page_header(__pa(sptep));
				2868	if (sp_ad_disabled(sp))
				2869	spte \|= shadow_acc_track_value;
				2870
				2871	/*
				2872	* For the EPT case, shadow_present_mask is 0 if hardware
				2873	* supports exec-only page table entries. In that case,
				2874	* ACC_USER_MASK and shadow_user_mask are used to represent
				2875	* read access. See FNAME(gpte_access) in paging_tmpl.h.
				2876	*/
				2877	spte \|= shadow_present_mask;
				2878	if (!speculative)
				2879	spte \|= spte_shadow_accessed_mask(spte);
				2880
				2881	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
				2882	is_nx_huge_page_enabled()) {
				2883	pte_access &= ~ACC_EXEC_MASK;
				2884	}
				2885
				2886	if (pte_access & ACC_EXEC_MASK)
				2887	spte \|= shadow_x_mask;
				2888	else
				2889	spte \|= shadow_nx_mask;
				2890
				2891	if (pte_access & ACC_USER_MASK)
				2892	spte \|= shadow_user_mask;
				2893
				2894	if (level > PT_PAGE_TABLE_LEVEL)
				2895	spte \|= PT_PAGE_SIZE_MASK;
				2896	if (tdp_enabled)
				2897	spte \|= kvm_x86_ops->get_mt_mask(vcpu, gfn,
				2898	kvm_is_mmio_pfn(pfn));
				2899
				2900	if (host_writable)
				2901	spte \|= SPTE_HOST_WRITEABLE;
				2902	else
				2903	pte_access &= ~ACC_WRITE_MASK;
				2904
				2905	if (!kvm_is_mmio_pfn(pfn))
				2906	spte \|= shadow_me_mask;
				2907
				2908	spte \|= (u64)pfn << PAGE_SHIFT;
				2909
				2910	if (pte_access & ACC_WRITE_MASK) {
				2911
				2912	/*
				2913	* Other vcpu creates new sp in the window between
				2914	* mapping_level() and acquiring mmu-lock. We can
				2915	* allow guest to retry the access, the mapping can
				2916	* be fixed if guest refault.
				2917	*/
				2918	if (level > PT_PAGE_TABLE_LEVEL &&
				2919	mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
				2920	goto done;
				2921
				2922	spte \|= PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE;
				2923
				2924	/*
				2925	* Optimization: for pte sync, if spte was writable the hash
				2926	* lookup is unnecessary (and expensive). Write protection
				2927	* is responsibility of mmu_get_page / kvm_sync_page.
				2928	* Same reasoning can be applied to dirty page accounting.
				2929	*/
				2930	if (!can_unsync && is_writable_pte(*sptep))
				2931	goto set_pte;
				2932
				2933	if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
				2934	pgprintk("%s: found shadow page for %llx, marking ro\n",
				2935	__func__, gfn);
				2936	ret = 1;
				2937	pte_access &= ~ACC_WRITE_MASK;
				2938	spte &= ~(PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE);
				2939	}
				2940	}
				2941
				2942	if (pte_access & ACC_WRITE_MASK) {
				2943	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				2944	spte \|= spte_shadow_dirty_mask(spte);
				2945	}
				2946
				2947	if (speculative)
				2948	spte = mark_spte_for_access_track(spte);
				2949
				2950	set_pte:
				2951	if (mmu_spte_update(sptep, spte))
				2952	kvm_flush_remote_tlbs(vcpu->kvm);
				2953	done:
				2954	return ret;
				2955	}
				2956
				2957	static int mmu_set_spte(struct kvm_vcpu vcpu, u64 sptep, unsigned pte_access,
				2958	int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
				2959	bool speculative, bool host_writable)
				2960	{
				2961	int was_rmapped = 0;
				2962	int rmap_count;
				2963	int ret = RET_PF_RETRY;
				2964
				2965	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
				2966	*sptep, write_fault, gfn);
				2967
				2968	if (is_shadow_present_pte(*sptep)) {
				2969	/*
				2970	* If we overwrite a PTE page pointer with a 2MB PMD, unlink
				2971	* the parent of the now unreachable PTE.
				2972	*/
				2973	if (level > PT_PAGE_TABLE_LEVEL &&
				2974	!is_large_pte(*sptep)) {
				2975	struct kvm_mmu_page *child;
				2976	u64 pte = *sptep;
				2977
				2978	child = page_header(pte & PT64_BASE_ADDR_MASK);
				2979	drop_parent_pte(child, sptep);
				2980	kvm_flush_remote_tlbs(vcpu->kvm);
				2981	} else if (pfn != spte_to_pfn(*sptep)) {
				2982	pgprintk("hfn old %llx new %llx\n",
				2983	spte_to_pfn(*sptep), pfn);
				2984	drop_spte(vcpu->kvm, sptep);
				2985	kvm_flush_remote_tlbs(vcpu->kvm);
				2986	} else
				2987	was_rmapped = 1;
				2988	}
				2989
				2990	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
				2991	true, host_writable)) {
				2992	if (write_fault)
				2993	ret = RET_PF_EMULATE;
				2994	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2995	}
				2996
				2997	if (unlikely(is_mmio_spte(*sptep)))
				2998	ret = RET_PF_EMULATE;
				2999
				3000	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
				3001	trace_kvm_mmu_set_spte(level, gfn, sptep);
				3002	if (!was_rmapped && is_large_pte(*sptep))
				3003	++vcpu->kvm->stat.lpages;
				3004
				3005	if (is_shadow_present_pte(*sptep)) {
				3006	if (!was_rmapped) {
				3007	rmap_count = rmap_add(vcpu, sptep, gfn);
				3008	if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				3009	rmap_recycle(vcpu, sptep, gfn);
				3010	}
				3011	}
				3012
				3013	return ret;
				3014	}
				3015
				3016	static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				3017	bool no_dirty_log)
				3018	{
				3019	struct kvm_memory_slot *slot;
				3020
				3021	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
				3022	if (!slot)
				3023	return KVM_PFN_ERR_FAULT;
				3024
				3025	return gfn_to_pfn_memslot_atomic(slot, gfn);
				3026	}
				3027
				3028	static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				3029	struct kvm_mmu_page *sp,
				3030	u64 start, u64 end)
				3031	{
				3032	struct page *pages[PTE_PREFETCH_NUM];
				3033	struct kvm_memory_slot *slot;
				3034	unsigned access = sp->role.access;
				3035	int i, ret;
				3036	gfn_t gfn;
				3037
				3038	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
				3039	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
				3040	if (!slot)
				3041	return -1;
				3042
				3043	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
				3044	if (ret <= 0)
				3045	return -1;
				3046
				3047	for (i = 0; i < ret; i++, gfn++, start++) {
				3048	mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
				3049	page_to_pfn(pages[i]), true, true);
				3050	put_page(pages[i]);
				3051	}
				3052
				3053	return 0;
				3054	}
				3055
				3056	static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				3057	struct kvm_mmu_page sp, u64 sptep)
				3058	{
				3059	u64 spte, start = NULL;
				3060	int i;
				3061
				3062	WARN_ON(!sp->role.direct);
				3063
				3064	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
				3065	spte = sp->spt + i;
				3066
				3067	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
				3068	if (is_shadow_present_pte(*spte) \|\| spte == sptep) {
				3069	if (!start)
				3070	continue;
				3071	if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				3072	break;
				3073	start = NULL;
				3074	} else if (!start)
				3075	start = spte;
				3076	}
				3077	}
				3078
				3079	static void direct_pte_prefetch(struct kvm_vcpu vcpu, u64 sptep)
				3080	{
				3081	struct kvm_mmu_page *sp;
				3082
				3083	sp = page_header(__pa(sptep));
				3084
				3085	/*
				3086	* Without accessed bits, there's no way to distinguish between
				3087	* actually accessed translations and prefetched, so disable pte
				3088	* prefetch if accessed bits aren't available.
				3089	*/
				3090	if (sp_ad_disabled(sp))
				3091	return;
				3092
				3093	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3094	return;
				3095
				3096	__direct_pte_prefetch(vcpu, sp, sptep);
				3097	}
				3098
				3099	static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
				3100	gfn_t gfn, kvm_pfn_t pfnp, int levelp)
				3101	{
				3102	int level = *levelp;
				3103	u64 spte = *it.sptep;
				3104
				3105	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
				3106	is_nx_huge_page_enabled() &&
				3107	is_shadow_present_pte(spte) &&
				3108	!is_large_pte(spte)) {
				3109	/*
				3110	* A small SPTE exists for this pfn, but FNAME(fetch)
				3111	* and __direct_map would like to create a large PTE
				3112	* instead: just force them to go down another level,
				3113	* patching back for them into pfn the next 9 bits of
				3114	* the address.
				3115	*/
				3116	u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
				3117	*pfnp \|= gfn & page_mask;
				3118	(*levelp)--;
				3119	}
				3120	}
				3121
				3122	static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
				3123	int map_writable, int level, kvm_pfn_t pfn,
				3124	bool prefault, bool lpage_disallowed)
				3125	{
				3126	struct kvm_shadow_walk_iterator it;
				3127	struct kvm_mmu_page *sp;
				3128	int ret;
				3129	gfn_t gfn = gpa >> PAGE_SHIFT;
				3130	gfn_t base_gfn = gfn;
				3131
				3132	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3133	return RET_PF_RETRY;
				3134
				3135	trace_kvm_mmu_spte_requested(gpa, level, pfn);
				3136	for_each_shadow_entry(vcpu, gpa, it) {
				3137	/*
				3138	* We cannot overwrite existing page tables with an NX
				3139	* large page, as the leaf could be executable.
				3140	*/
				3141	disallowed_hugepage_adjust(it, gfn, &pfn, &level);
				3142
				3143	base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
				3144	if (it.level == level)
				3145	break;
				3146
				3147	drop_large_spte(vcpu, it.sptep);
				3148	if (!is_shadow_present_pte(*it.sptep)) {
				3149	sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
				3150	it.level - 1, true, ACC_ALL);
				3151
				3152	link_shadow_page(vcpu, it.sptep, sp);
				3153	if (lpage_disallowed)
				3154	account_huge_nx_page(vcpu->kvm, sp);
				3155	}
				3156	}
				3157
				3158	ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
				3159	write, level, base_gfn, pfn, prefault,
				3160	map_writable);
				3161	direct_pte_prefetch(vcpu, it.sptep);
				3162	++vcpu->stat.pf_fixed;
				3163	return ret;
				3164	}
				3165
				3166	static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
				3167	{
				3168	siginfo_t info;
				3169
				3170	info.si_signo = SIGBUS;
				3171	info.si_errno = 0;
				3172	info.si_code = BUS_MCEERR_AR;
				3173	info.si_addr = (void __user *)address;
				3174	info.si_addr_lsb = PAGE_SHIFT;
				3175
				3176	send_sig_info(SIGBUS, &info, tsk);
				3177	}
				3178
				3179	static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
				3180	{
				3181	/*
				3182	* Do not cache the mmio info caused by writing the readonly gfn
				3183	* into the spte otherwise read access on readonly gfn also can
				3184	* caused mmio page fault and treat it as mmio access.
				3185	*/
				3186	if (pfn == KVM_PFN_ERR_RO_FAULT)
				3187	return RET_PF_EMULATE;
				3188
				3189	if (pfn == KVM_PFN_ERR_HWPOISON) {
				3190	kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
				3191	return RET_PF_RETRY;
				3192	}
				3193
				3194	return -EFAULT;
				3195	}
				3196
				3197	static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
				3198	gfn_t gfn, kvm_pfn_t *pfnp,
				3199	int *levelp)
				3200	{
				3201	kvm_pfn_t pfn = *pfnp;
				3202	int level = *levelp;
				3203
				3204	/*
				3205	* Check if it's a transparent hugepage. If this would be an
				3206	* hugetlbfs page, level wouldn't be set to
				3207	* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
				3208	* here.
				3209	*/
				3210	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
				3211	!kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
				3212	PageTransCompoundMap(pfn_to_page(pfn)) &&
				3213	!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
				3214	unsigned long mask;
				3215	/*
				3216	* mmu_notifier_retry was successful and we hold the
				3217	* mmu_lock here, so the pmd can't become splitting
				3218	* from under us, and in turn
				3219	* __split_huge_page_refcount() can't run from under
				3220	* us and we can safely transfer the refcount from
				3221	* PG_tail to PG_head as we switch the pfn to tail to
				3222	* head.
				3223	*/
				3224	*levelp = level = PT_DIRECTORY_LEVEL;
				3225	mask = KVM_PAGES_PER_HPAGE(level) - 1;
				3226	VM_BUG_ON((gfn & mask) != (pfn & mask));
				3227	if (pfn & mask) {
				3228	kvm_release_pfn_clean(pfn);
				3229	pfn &= ~mask;
				3230	kvm_get_pfn(pfn);
				3231	*pfnp = pfn;
				3232	}
				3233	}
				3234	}
				3235
				3236	static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
				3237	kvm_pfn_t pfn, unsigned access, int *ret_val)
				3238	{
				3239	/* The pfn is invalid, report the error! */
				3240	if (unlikely(is_error_pfn(pfn))) {
				3241	*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
				3242	return true;
				3243	}
				3244
				3245	if (unlikely(is_noslot_pfn(pfn)))
				3246	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
				3247
				3248	return false;
				3249	}
				3250
				3251	static bool page_fault_can_be_fast(u32 error_code)
				3252	{
				3253	/*
				3254	* Do not fix the mmio spte with invalid generation number which
				3255	* need to be updated by slow page fault path.
				3256	*/
				3257	if (unlikely(error_code & PFERR_RSVD_MASK))
				3258	return false;
				3259
				3260	/* See if the page fault is due to an NX violation */
				3261	if (unlikely(((error_code & (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))
				3262	== (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))))
				3263	return false;
				3264
				3265	/*
				3266	* #PF can be fast if:
				3267	* 1. The shadow page table entry is not present, which could mean that
				3268	* the fault is potentially caused by access tracking (if enabled).
				3269	* 2. The shadow page table entry is present and the fault
				3270	* is caused by write-protect, that means we just need change the W
				3271	* bit of the spte which can be done out of mmu-lock.
				3272	*
				3273	* However, if access tracking is disabled we know that a non-present
				3274	* page must be a genuine page fault where we have to create a new SPTE.
				3275	* So, if access tracking is disabled, we return true only for write
				3276	* accesses to a present page.
				3277	*/
				3278
				3279	return shadow_acc_track_mask != 0 \|\|
				3280	((error_code & (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK))
				3281	== (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK));
				3282	}
				3283
				3284	/*
				3285	* Returns true if the SPTE was fixed successfully. Otherwise,
				3286	* someone else modified the SPTE from its original value.
				3287	*/
				3288	static bool
				3289	fast_pf_fix_direct_spte(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				3290	u64 *sptep, u64 old_spte, u64 new_spte)
				3291	{
				3292	gfn_t gfn;
				3293
				3294	WARN_ON(!sp->role.direct);
				3295
				3296	/*
				3297	* Theoretically we could also set dirty bit (and flush TLB) here in
				3298	* order to eliminate unnecessary PML logging. See comments in
				3299	* set_spte. But fast_page_fault is very unlikely to happen with PML
				3300	* enabled, so we do not do this. This might result in the same GPA
				3301	* to be logged in PML buffer again when the write really happens, and
				3302	* eventually to be called by mark_page_dirty twice. But it's also no
				3303	* harm. This also avoids the TLB flush needed after setting dirty bit
				3304	* so non-PML cases won't be impacted.
				3305	*
				3306	* Compare with set_spte where instead shadow_dirty_mask is set.
				3307	*/
				3308	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
				3309	return false;
				3310
				3311	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
				3312	/*
				3313	* The gfn of direct spte is stable since it is
				3314	* calculated by sp->gfn.
				3315	*/
				3316	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
				3317	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				3318	}
				3319
				3320	return true;
				3321	}
				3322
				3323	static bool is_access_allowed(u32 fault_err_code, u64 spte)
				3324	{
				3325	if (fault_err_code & PFERR_FETCH_MASK)
				3326	return is_executable_pte(spte);
				3327
				3328	if (fault_err_code & PFERR_WRITE_MASK)
				3329	return is_writable_pte(spte);
				3330
				3331	/* Fault was on Read access */
				3332	return spte & PT_PRESENT_MASK;
				3333	}
				3334
				3335	/*
				3336	* Return value:
				3337	* - true: let the vcpu to access on the same address again.
				3338	* - false: let the real page fault path to fix it.
				3339	*/
				3340	static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
				3341	u32 error_code)
				3342	{
				3343	struct kvm_shadow_walk_iterator iterator;
				3344	struct kvm_mmu_page *sp;
				3345	bool fault_handled = false;
				3346	u64 spte = 0ull;
				3347	uint retry_count = 0;
				3348
				3349	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3350	return false;
				3351
				3352	if (!page_fault_can_be_fast(error_code))
				3353	return false;
				3354
				3355	walk_shadow_page_lockless_begin(vcpu);
				3356
				3357	do {
				3358	u64 new_spte;
				3359
				3360	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
				3361	if (!is_shadow_present_pte(spte) \|\|
				3362	iterator.level < level)
				3363	break;
				3364
				3365	sp = page_header(__pa(iterator.sptep));
				3366	if (!is_last_spte(spte, sp->role.level))
				3367	break;
				3368
				3369	/*
				3370	* Check whether the memory access that caused the fault would
				3371	* still cause it if it were to be performed right now. If not,
				3372	* then this is a spurious fault caused by TLB lazily flushed,
				3373	* or some other CPU has already fixed the PTE after the
				3374	* current CPU took the fault.
				3375	*
				3376	* Need not check the access of upper level table entries since
				3377	* they are always ACC_ALL.
				3378	*/
				3379	if (is_access_allowed(error_code, spte)) {
				3380	fault_handled = true;
				3381	break;
				3382	}
				3383
				3384	new_spte = spte;
				3385
				3386	if (is_access_track_spte(spte))
				3387	new_spte = restore_acc_track_spte(new_spte);
				3388
				3389	/*
				3390	* Currently, to simplify the code, write-protection can
				3391	* be removed in the fast path only if the SPTE was
				3392	* write-protected for dirty-logging or access tracking.
				3393	*/
				3394	if ((error_code & PFERR_WRITE_MASK) &&
				3395	spte_can_locklessly_be_made_writable(spte))
				3396	{
				3397	new_spte \|= PT_WRITABLE_MASK;
				3398
				3399	/*
				3400	* Do not fix write-permission on the large spte. Since
				3401	* we only dirty the first page into the dirty-bitmap in
				3402	* fast_pf_fix_direct_spte(), other pages are missed
				3403	* if its slot has dirty logging enabled.
				3404	*
				3405	* Instead, we let the slow page fault path create a
				3406	* normal spte to fix the access.
				3407	*
				3408	* See the comments in kvm_arch_commit_memory_region().
				3409	*/
				3410	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3411	break;
				3412	}
				3413
				3414	/* Verify that the fault can be handled in the fast path */
				3415	if (new_spte == spte \|\|
				3416	!is_access_allowed(error_code, new_spte))
				3417	break;
				3418
				3419	/*
				3420	* Currently, fast page fault only works for direct mapping
				3421	* since the gfn is not stable for indirect shadow page. See
				3422	* Documentation/virtual/kvm/locking.txt to get more detail.
				3423	*/
				3424	fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
				3425	iterator.sptep, spte,
				3426	new_spte);
				3427	if (fault_handled)
				3428	break;
				3429
				3430	if (++retry_count > 4) {
				3431	printk_once(KERN_WARNING
				3432	"kvm: Fast #PF retrying more than 4 times.\n");
				3433	break;
				3434	}
				3435
				3436	} while (true);
				3437
				3438	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
				3439	spte, fault_handled);
				3440	walk_shadow_page_lockless_end(vcpu);
				3441
				3442	return fault_handled;
				3443	}
				3444
				3445	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				3446	gva_t gva, kvm_pfn_t pfn, bool write, bool writable);
				3447	static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
				3448
				3449	static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
				3450	gfn_t gfn, bool prefault)
				3451	{
				3452	int r;
				3453	int level;
				3454	bool force_pt_level;
				3455	kvm_pfn_t pfn;
				3456	unsigned long mmu_seq;
				3457	bool map_writable, write = error_code & PFERR_WRITE_MASK;
				3458	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				3459	is_nx_huge_page_enabled();
				3460
				3461	force_pt_level = lpage_disallowed;
				3462	level = mapping_level(vcpu, gfn, &force_pt_level);
				3463	if (likely(!force_pt_level)) {
				3464	/*
				3465	* This path builds a PAE pagetable - so we can map
				3466	* 2mb pages at maximum. Therefore check if the level
				3467	* is larger than that.
				3468	*/
				3469	if (level > PT_DIRECTORY_LEVEL)
				3470	level = PT_DIRECTORY_LEVEL;
				3471
				3472	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				3473	}
				3474
				3475	if (fast_page_fault(vcpu, v, level, error_code))
				3476	return RET_PF_RETRY;
				3477
				3478	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				3479	smp_rmb();
				3480
				3481	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
				3482	return RET_PF_RETRY;
				3483
				3484	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
				3485	return r;
				3486
				3487	r = RET_PF_RETRY;
				3488	spin_lock(&vcpu->kvm->mmu_lock);
				3489	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				3490	goto out_unlock;
				3491	if (make_mmu_pages_available(vcpu) < 0)
				3492	goto out_unlock;
				3493	if (likely(!force_pt_level))
				3494	transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
				3495	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
				3496	prefault, false);
				3497	out_unlock:
				3498	spin_unlock(&vcpu->kvm->mmu_lock);
				3499	kvm_release_pfn_clean(pfn);
				3500	return r;
				3501	}
				3502
				3503
				3504	static void mmu_free_roots(struct kvm_vcpu *vcpu)
				3505	{
				3506	int i;
				3507	struct kvm_mmu_page *sp;
				3508	LIST_HEAD(invalid_list);
				3509
				3510	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3511	return;
				3512
				3513	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
				3514	(vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL \|\|
				3515	vcpu->arch.mmu.direct_map)) {
				3516	hpa_t root = vcpu->arch.mmu.root_hpa;
				3517
				3518	spin_lock(&vcpu->kvm->mmu_lock);
				3519	sp = page_header(root);
				3520	--sp->root_count;
				3521	if (!sp->root_count && sp->role.invalid) {
				3522	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
				3523	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				3524	}
				3525	spin_unlock(&vcpu->kvm->mmu_lock);
				3526	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				3527	return;
				3528	}
				3529
				3530	spin_lock(&vcpu->kvm->mmu_lock);
				3531	for (i = 0; i < 4; ++i) {
				3532	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3533
				3534	if (root) {
				3535	root &= PT64_BASE_ADDR_MASK;
				3536	sp = page_header(root);
				3537	--sp->root_count;
				3538	if (!sp->root_count && sp->role.invalid)
				3539	kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
				3540	&invalid_list);
				3541	}
				3542	vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
				3543	}
				3544	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				3545	spin_unlock(&vcpu->kvm->mmu_lock);
				3546	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				3547	}
				3548
				3549	static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
				3550	{
				3551	int ret = 0;
				3552
				3553	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
				3554	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				3555	ret = 1;
				3556	}
				3557
				3558	return ret;
				3559	}
				3560
				3561	static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
				3562	{
				3563	struct kvm_mmu_page *sp;
				3564	unsigned i;
				3565
				3566	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
				3567	spin_lock(&vcpu->kvm->mmu_lock);
				3568	if(make_mmu_pages_available(vcpu) < 0) {
				3569	spin_unlock(&vcpu->kvm->mmu_lock);
				3570	return -ENOSPC;
				3571	}
				3572	sp = kvm_mmu_get_page(vcpu, 0, 0,
				3573	vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
				3574	++sp->root_count;
				3575	spin_unlock(&vcpu->kvm->mmu_lock);
				3576	vcpu->arch.mmu.root_hpa = __pa(sp->spt);
				3577	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
				3578	for (i = 0; i < 4; ++i) {
				3579	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3580
				3581	MMU_WARN_ON(VALID_PAGE(root));
				3582	spin_lock(&vcpu->kvm->mmu_lock);
				3583	if (make_mmu_pages_available(vcpu) < 0) {
				3584	spin_unlock(&vcpu->kvm->mmu_lock);
				3585	return -ENOSPC;
				3586	}
				3587	sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
				3588	i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
				3589	root = __pa(sp->spt);
				3590	++sp->root_count;
				3591	spin_unlock(&vcpu->kvm->mmu_lock);
				3592	vcpu->arch.mmu.pae_root[i] = root \| PT_PRESENT_MASK;
				3593	}
				3594	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3595	} else
				3596	BUG();
				3597
				3598	return 0;
				3599	}
				3600
				3601	static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
				3602	{
				3603	struct kvm_mmu_page *sp;
				3604	u64 pdptr, pm_mask;
				3605	gfn_t root_gfn;
				3606	int i;
				3607
				3608	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
				3609
				3610	if (mmu_check_root(vcpu, root_gfn))
				3611	return 1;
				3612
				3613	/*
				3614	* Do we shadow a long mode page table? If so we need to
				3615	* write-protect the guests page table root.
				3616	*/
				3617	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3618	hpa_t root = vcpu->arch.mmu.root_hpa;
				3619
				3620	MMU_WARN_ON(VALID_PAGE(root));
				3621
				3622	spin_lock(&vcpu->kvm->mmu_lock);
				3623	if (make_mmu_pages_available(vcpu) < 0) {
				3624	spin_unlock(&vcpu->kvm->mmu_lock);
				3625	return -ENOSPC;
				3626	}
				3627	sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				3628	vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
				3629	root = __pa(sp->spt);
				3630	++sp->root_count;
				3631	spin_unlock(&vcpu->kvm->mmu_lock);
				3632	vcpu->arch.mmu.root_hpa = root;
				3633	return 0;
				3634	}
				3635
				3636	/*
				3637	* We shadow a 32 bit page table. This may be a legacy 2-level
				3638	* or a PAE 3-level page table. In either case we need to be aware that
				3639	* the shadow page table may be a PAE or a long mode page table.
				3640	*/
				3641	pm_mask = PT_PRESENT_MASK;
				3642	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
				3643	pm_mask \|= PT_ACCESSED_MASK \| PT_WRITABLE_MASK \| PT_USER_MASK;
				3644
				3645	for (i = 0; i < 4; ++i) {
				3646	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3647
				3648	MMU_WARN_ON(VALID_PAGE(root));
				3649	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
				3650	pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
				3651	if (!(pdptr & PT_PRESENT_MASK)) {
				3652	vcpu->arch.mmu.pae_root[i] = 0;
				3653	continue;
				3654	}
				3655	root_gfn = pdptr >> PAGE_SHIFT;
				3656	if (mmu_check_root(vcpu, root_gfn))
				3657	return 1;
				3658	}
				3659	spin_lock(&vcpu->kvm->mmu_lock);
				3660	if (make_mmu_pages_available(vcpu) < 0) {
				3661	spin_unlock(&vcpu->kvm->mmu_lock);
				3662	return -ENOSPC;
				3663	}
				3664	sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
				3665	0, ACC_ALL);
				3666	root = __pa(sp->spt);
				3667	++sp->root_count;
				3668	spin_unlock(&vcpu->kvm->mmu_lock);
				3669
				3670	vcpu->arch.mmu.pae_root[i] = root \| pm_mask;
				3671	}
				3672	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3673
				3674	/*
				3675	* If we shadow a 32 bit page table with a long mode page
				3676	* table we enter this path.
				3677	*/
				3678	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
				3679	if (vcpu->arch.mmu.lm_root == NULL) {
				3680	/*
				3681	* The additional page necessary for this is only
				3682	* allocated on demand.
				3683	*/
				3684
				3685	u64 *lm_root;
				3686
				3687	lm_root = (void*)get_zeroed_page(GFP_KERNEL);
				3688	if (lm_root == NULL)
				3689	return 1;
				3690
				3691	lm_root[0] = __pa(vcpu->arch.mmu.pae_root) \| pm_mask;
				3692
				3693	vcpu->arch.mmu.lm_root = lm_root;
				3694	}
				3695
				3696	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
				3697	}
				3698
				3699	return 0;
				3700	}
				3701
				3702	static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
				3703	{
				3704	if (vcpu->arch.mmu.direct_map)
				3705	return mmu_alloc_direct_roots(vcpu);
				3706	else
				3707	return mmu_alloc_shadow_roots(vcpu);
				3708	}
				3709
				3710	static void mmu_sync_roots(struct kvm_vcpu *vcpu)
				3711	{
				3712	int i;
				3713	struct kvm_mmu_page *sp;
				3714
				3715	if (vcpu->arch.mmu.direct_map)
				3716	return;
				3717
				3718	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3719	return;
				3720
				3721	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
				3722	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
				3723	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3724	hpa_t root = vcpu->arch.mmu.root_hpa;
				3725	sp = page_header(root);
				3726	mmu_sync_children(vcpu, sp);
				3727	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3728	return;
				3729	}
				3730	for (i = 0; i < 4; ++i) {
				3731	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3732
				3733	if (root && VALID_PAGE(root)) {
				3734	root &= PT64_BASE_ADDR_MASK;
				3735	sp = page_header(root);
				3736	mmu_sync_children(vcpu, sp);
				3737	}
				3738	}
				3739	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3740	}
				3741
				3742	void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
				3743	{
				3744	spin_lock(&vcpu->kvm->mmu_lock);
				3745	mmu_sync_roots(vcpu);
				3746	spin_unlock(&vcpu->kvm->mmu_lock);
				3747	}
				3748	EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
				3749
				3750	static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				3751	u32 access, struct x86_exception *exception)
				3752	{
				3753	if (exception)
				3754	exception->error_code = 0;
				3755	return vaddr;
				3756	}
				3757
				3758	static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
				3759	u32 access,
				3760	struct x86_exception *exception)
				3761	{
				3762	if (exception)
				3763	exception->error_code = 0;
				3764	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
				3765	}
				3766
				3767	static bool
				3768	__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
				3769	{
				3770	int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
				3771
				3772	return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) \|
				3773	((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
				3774	}
				3775
				3776	static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
				3777	{
				3778	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
				3779	}
				3780
				3781	static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
				3782	{
				3783	return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
				3784	}
				3785
				3786	static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3787	{
				3788	/*
				3789	* A nested guest cannot use the MMIO cache if it is using nested
				3790	* page tables, because cr2 is a nGPA while the cache stores GPAs.
				3791	*/
				3792	if (mmu_is_nested(vcpu))
				3793	return false;
				3794
				3795	if (direct)
				3796	return vcpu_match_mmio_gpa(vcpu, addr);
				3797
				3798	return vcpu_match_mmio_gva(vcpu, addr);
				3799	}
				3800
				3801	/* return true if reserved bit is detected on spte. */
				3802	static bool
				3803	walk_shadow_page_get_mmio_spte(struct kvm_vcpu vcpu, u64 addr, u64 sptep)
				3804	{
				3805	struct kvm_shadow_walk_iterator iterator;
				3806	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
				3807	int root, leaf;
				3808	bool reserved = false;
				3809
				3810	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3811	goto exit;
				3812
				3813	walk_shadow_page_lockless_begin(vcpu);
				3814
				3815	for (shadow_walk_init(&iterator, vcpu, addr),
				3816	leaf = root = iterator.level;
				3817	shadow_walk_okay(&iterator);
				3818	__shadow_walk_next(&iterator, spte)) {
				3819	spte = mmu_spte_get_lockless(iterator.sptep);
				3820
				3821	sptes[leaf - 1] = spte;
				3822	leaf--;
				3823
				3824	if (!is_shadow_present_pte(spte))
				3825	break;
				3826
				3827	reserved \|= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
				3828	iterator.level);
				3829	}
				3830
				3831	walk_shadow_page_lockless_end(vcpu);
				3832
				3833	if (reserved) {
				3834	pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
				3835	__func__, addr);
				3836	while (root > leaf) {
				3837	pr_err("------ spte 0x%llx level %d.\n",
				3838	sptes[root - 1], root);
				3839	root--;
				3840	}
				3841	}
				3842	exit:
				3843	*sptep = spte;
				3844	return reserved;
				3845	}
				3846
				3847	static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3848	{
				3849	u64 spte;
				3850	bool reserved;
				3851
				3852	if (mmio_info_in_cache(vcpu, addr, direct))
				3853	return RET_PF_EMULATE;
				3854
				3855	reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
				3856	if (WARN_ON(reserved))
				3857	return -EINVAL;
				3858
				3859	if (is_mmio_spte(spte)) {
				3860	gfn_t gfn = get_mmio_spte_gfn(spte);
				3861	unsigned access = get_mmio_spte_access(spte);
				3862
				3863	if (!check_mmio_spte(vcpu, spte))
				3864	return RET_PF_INVALID;
				3865
				3866	if (direct)
				3867	addr = 0;
				3868
				3869	trace_handle_mmio_page_fault(addr, gfn, access);
				3870	vcpu_cache_mmio_info(vcpu, addr, gfn, access);
				3871	return RET_PF_EMULATE;
				3872	}
				3873
				3874	/*
				3875	* If the page table is zapped by other cpus, let CPU fault again on
				3876	* the address.
				3877	*/
				3878	return RET_PF_RETRY;
				3879	}
				3880	EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
				3881
				3882	static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
				3883	u32 error_code, gfn_t gfn)
				3884	{
				3885	if (unlikely(error_code & PFERR_RSVD_MASK))
				3886	return false;
				3887
				3888	if (!(error_code & PFERR_PRESENT_MASK) \|\|
				3889	!(error_code & PFERR_WRITE_MASK))
				3890	return false;
				3891
				3892	/*
				3893	* guest is writing the page which is write tracked which can
				3894	* not be fixed by page fault handler.
				3895	*/
				3896	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				3897	return true;
				3898
				3899	return false;
				3900	}
				3901
				3902	static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
				3903	{
				3904	struct kvm_shadow_walk_iterator iterator;
				3905	u64 spte;
				3906
				3907	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3908	return;
				3909
				3910	walk_shadow_page_lockless_begin(vcpu);
				3911	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
				3912	clear_sp_write_flooding_count(iterator.sptep);
				3913	if (!is_shadow_present_pte(spte))
				3914	break;
				3915	}
				3916	walk_shadow_page_lockless_end(vcpu);
				3917	}
				3918
				3919	static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				3920	u32 error_code, bool prefault)
				3921	{
				3922	gfn_t gfn = gva >> PAGE_SHIFT;
				3923	int r;
				3924
				3925	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
				3926
				3927	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				3928	return RET_PF_EMULATE;
				3929
				3930	r = mmu_topup_memory_caches(vcpu);
				3931	if (r)
				3932	return r;
				3933
				3934	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				3935
				3936
				3937	return nonpaging_map(vcpu, gva & PAGE_MASK,
				3938	error_code, gfn, prefault);
				3939	}
				3940
				3941	static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
				3942	{
				3943	struct kvm_arch_async_pf arch;
				3944
				3945	arch.token = (vcpu->arch.apf.id++ << 12) \| vcpu->vcpu_id;
				3946	arch.gfn = gfn;
				3947	arch.direct_map = vcpu->arch.mmu.direct_map;
				3948	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
				3949
				3950	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
				3951	}
				3952
				3953	bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
				3954	{
				3955	if (unlikely(!lapic_in_kernel(vcpu) \|\|
				3956	kvm_event_needs_reinjection(vcpu) \|\|
				3957	vcpu->arch.exception.pending))
				3958	return false;
				3959
				3960	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
				3961	return false;
				3962
				3963	return kvm_x86_ops->interrupt_allowed(vcpu);
				3964	}
				3965
				3966	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				3967	gva_t gva, kvm_pfn_t pfn, bool write, bool writable)
				3968	{
				3969	struct kvm_memory_slot *slot;
				3970	bool async;
				3971
				3972	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				3973	async = false;
				3974	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
				3975	if (!async)
				3976	return false; /* pfn has correct page already /
				3977
				3978	if (!prefault && kvm_can_do_async_pf(vcpu)) {
				3979	trace_kvm_try_async_get_page(gva, gfn);
				3980	if (kvm_find_async_pf_gfn(vcpu, gfn)) {
				3981	trace_kvm_async_pf_doublefault(gva, gfn);
				3982	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
				3983	return true;
				3984	} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
				3985	return true;
				3986	}
				3987
				3988	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
				3989	return false;
				3990	}
				3991
				3992	int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
				3993	u64 fault_address, char *insn, int insn_len,
				3994	bool need_unprotect)
				3995	{
				3996	int r = 1;
				3997
				3998	vcpu->arch.l1tf_flush_l1d = true;
				3999	switch (vcpu->arch.apf.host_apf_reason) {
				4000	default:
				4001	trace_kvm_page_fault(fault_address, error_code);
				4002
				4003	if (need_unprotect && kvm_event_needs_reinjection(vcpu))
				4004	kvm_mmu_unprotect_page_virt(vcpu, fault_address);
				4005	r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
				4006	insn_len);
				4007	break;
				4008	case KVM_PV_REASON_PAGE_NOT_PRESENT:
				4009	vcpu->arch.apf.host_apf_reason = 0;
				4010	local_irq_disable();
				4011	kvm_async_pf_task_wait(fault_address, 0);
				4012	local_irq_enable();
				4013	break;
				4014	case KVM_PV_REASON_PAGE_READY:
				4015	vcpu->arch.apf.host_apf_reason = 0;
				4016	local_irq_disable();
				4017	kvm_async_pf_task_wake(fault_address);
				4018	local_irq_enable();
				4019	break;
				4020	}
				4021	return r;
				4022	}
				4023	EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
				4024
				4025	static bool
				4026	check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
				4027	{
				4028	int page_num = KVM_PAGES_PER_HPAGE(level);
				4029
				4030	gfn &= ~(page_num - 1);
				4031
				4032	return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
				4033	}
				4034
				4035	static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
				4036	bool prefault)
				4037	{
				4038	kvm_pfn_t pfn;
				4039	int r;
				4040	int level;
				4041	bool force_pt_level;
				4042	gfn_t gfn = gpa >> PAGE_SHIFT;
				4043	unsigned long mmu_seq;
				4044	int write = error_code & PFERR_WRITE_MASK;
				4045	bool map_writable;
				4046	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				4047	is_nx_huge_page_enabled();
				4048
				4049	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4050
				4051	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				4052	return RET_PF_EMULATE;
				4053
				4054	r = mmu_topup_memory_caches(vcpu);
				4055	if (r)
				4056	return r;
				4057
				4058	force_pt_level =
				4059	lpage_disallowed \|\|
				4060	!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
				4061	level = mapping_level(vcpu, gfn, &force_pt_level);
				4062	if (likely(!force_pt_level)) {
				4063	if (level > PT_DIRECTORY_LEVEL &&
				4064	!check_hugepage_cache_consistency(vcpu, gfn, level))
				4065	level = PT_DIRECTORY_LEVEL;
				4066	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				4067	}
				4068
				4069	if (fast_page_fault(vcpu, gpa, level, error_code))
				4070	return RET_PF_RETRY;
				4071
				4072	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				4073	smp_rmb();
				4074
				4075	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
				4076	return RET_PF_RETRY;
				4077
				4078	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
				4079	return r;
				4080
				4081	r = RET_PF_RETRY;
				4082	spin_lock(&vcpu->kvm->mmu_lock);
				4083	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				4084	goto out_unlock;
				4085	if (make_mmu_pages_available(vcpu) < 0)
				4086	goto out_unlock;
				4087	if (likely(!force_pt_level))
				4088	transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
				4089	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
				4090	prefault, lpage_disallowed);
				4091	out_unlock:
				4092	spin_unlock(&vcpu->kvm->mmu_lock);
				4093	kvm_release_pfn_clean(pfn);
				4094	return r;
				4095	}
				4096
				4097	static void nonpaging_init_context(struct kvm_vcpu *vcpu,
				4098	struct kvm_mmu *context)
				4099	{
				4100	context->page_fault = nonpaging_page_fault;
				4101	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4102	context->sync_page = nonpaging_sync_page;
				4103	context->invlpg = nonpaging_invlpg;
				4104	context->update_pte = nonpaging_update_pte;
				4105	context->root_level = 0;
				4106	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4107	context->root_hpa = INVALID_PAGE;
				4108	context->direct_map = true;
				4109	context->nx = false;
				4110	}
				4111
				/* Called for a new CR3 value: releases the current MMU roots of @vcpu. */
				void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
				{
					mmu_free_roots(vcpu);
				}
				4116
				/* get_cr3 callback: returns the CR3 value read via kvm_read_cr3(). */
				static unsigned long get_cr3(struct kvm_vcpu *vcpu)
				{
					unsigned long cr3 = kvm_read_cr3(vcpu);

					return cr3;
				}
				4121
				4122	static void inject_page_fault(struct kvm_vcpu *vcpu,
				4123	struct x86_exception *fault)
				4124	{
				4125	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
				4126	}
				4127
				4128	static bool sync_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				4129	unsigned access, int *nr_present)
				4130	{
				4131	if (unlikely(is_mmio_spte(*sptep))) {
				4132	if (gfn != get_mmio_spte_gfn(*sptep)) {
				4133	mmu_spte_clear_no_track(sptep);
				4134	return true;
				4135	}
				4136
				4137	(*nr_present)++;
				4138	mark_mmio_spte(vcpu, sptep, gfn, access);
				4139	return true;
				4140	}
				4141
				4142	return false;
				4143	}
				4144
				4145	static inline bool is_last_gpte(struct kvm_mmu *mmu,
				4146	unsigned level, unsigned gpte)
				4147	{
				4148	/*
				4149	* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
				4150	* If it is clear, there are no large pages at this level, so clear
				4151	* PT_PAGE_SIZE_MASK in gpte if that is the case.
				4152	*/
				4153	gpte &= level - mmu->last_nonleaf_level;
				4154
				4155	/*
				4156	* PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
				4157	* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
				4158	* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
				4159	*/
				4160	gpte \|= level - PT_PAGE_TABLE_LEVEL - 1;
				4161
				4162	return gpte & PT_PAGE_SIZE_MASK;
				4163	}
				4164
				/*
				 * paging_tmpl.h is included three times, each time with a different
				 * PTTYPE (PTTYPE_EPT, 64, 32).  Presumably each inclusion generates the
				 * matching ept_*, paging64_* and paging32_* helpers used in this file -
				 * confirm against paging_tmpl.h.  PTTYPE is undefined after every
				 * inclusion so the next #define does not redefine a live macro.
				 */
				4165	#define PTTYPE_EPT 18 /* arbitrary */
				4166	#define PTTYPE PTTYPE_EPT
				4167	#include "paging_tmpl.h"
				4168	#undef PTTYPE
				4169
				4170	#define PTTYPE 64
				4171	#include "paging_tmpl.h"
				4172	#undef PTTYPE
				4173
				4174	#define PTTYPE 32
				4175	#include "paging_tmpl.h"
				4176	#undef PTTYPE
				4177
				4178	static void
				4179	__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4180	struct rsvd_bits_validate *rsvd_check,
				4181	int maxphyaddr, int level, bool nx, bool gbpages,
				4182	bool pse, bool amd)
				4183	{
				4184	u64 exb_bit_rsvd = 0;
				4185	u64 gbpages_bit_rsvd = 0;
				4186	u64 nonleaf_bit8_rsvd = 0;
				4187
				4188	rsvd_check->bad_mt_xwr = 0;
				4189
				4190	if (!nx)
				4191	exb_bit_rsvd = rsvd_bits(63, 63);
				4192	if (!gbpages)
				4193	gbpages_bit_rsvd = rsvd_bits(7, 7);
				4194
				4195	/*
				4196	* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
				4197	* leaf entries) on AMD CPUs only.
				4198	*/
				4199	if (amd)
				4200	nonleaf_bit8_rsvd = rsvd_bits(8, 8);
				4201
				4202	switch (level) {
				4203	case PT32_ROOT_LEVEL:
				4204	/* no rsvd bits for 2 level 4K page table entries */
				4205	rsvd_check->rsvd_bits_mask[0][1] = 0;
				4206	rsvd_check->rsvd_bits_mask[0][0] = 0;
				4207	rsvd_check->rsvd_bits_mask[1][0] =
				4208	rsvd_check->rsvd_bits_mask[0][0];
				4209
				4210	if (!pse) {
				4211	rsvd_check->rsvd_bits_mask[1][1] = 0;
				4212	break;
				4213	}
				4214
				4215	if (is_cpuid_PSE36())
				4216	/* 36bits PSE 4MB page */
				4217	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
				4218	else
				4219	/* 32 bits PSE 4MB page */
				4220	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
				4221	break;
				4222	case PT32E_ROOT_LEVEL:
				4223	rsvd_check->rsvd_bits_mask[0][2] =
				4224	rsvd_bits(maxphyaddr, 63) \|
				4225	rsvd_bits(5, 8) \| rsvd_bits(1, 2); /* PDPTE */
				4226	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4227	rsvd_bits(maxphyaddr, 62); /* PDE */
				4228	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4229	rsvd_bits(maxphyaddr, 62); /* PTE */
				4230	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4231	rsvd_bits(maxphyaddr, 62) \|
				4232	rsvd_bits(13, 20); /* large page */
				4233	rsvd_check->rsvd_bits_mask[1][0] =
				4234	rsvd_check->rsvd_bits_mask[0][0];
				4235	break;
				4236	case PT64_ROOT_5LEVEL:
				4237	rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd \|
				4238	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4239	rsvd_bits(maxphyaddr, 51);
				4240	rsvd_check->rsvd_bits_mask[1][4] =
				4241	rsvd_check->rsvd_bits_mask[0][4];
				4242	case PT64_ROOT_4LEVEL:
				4243	rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd \|
				4244	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4245	rsvd_bits(maxphyaddr, 51);
				4246	rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd \|
				4247	gbpages_bit_rsvd \|
				4248	rsvd_bits(maxphyaddr, 51);
				4249	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4250	rsvd_bits(maxphyaddr, 51);
				4251	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4252	rsvd_bits(maxphyaddr, 51);
				4253	rsvd_check->rsvd_bits_mask[1][3] =
				4254	rsvd_check->rsvd_bits_mask[0][3];
				4255	rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd \|
				4256	gbpages_bit_rsvd \| rsvd_bits(maxphyaddr, 51) \|
				4257	rsvd_bits(13, 29);
				4258	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4259	rsvd_bits(maxphyaddr, 51) \|
				4260	rsvd_bits(13, 20); /* large page */
				4261	rsvd_check->rsvd_bits_mask[1][0] =
				4262	rsvd_check->rsvd_bits_mask[0][0];
				4263	break;
				4264	}
				4265	}
				4266
				4267	static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4268	struct kvm_mmu *context)
				4269	{
				4270	__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
				4271	cpuid_maxphyaddr(vcpu), context->root_level,
				4272	context->nx,
				4273	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4274	is_pse(vcpu), guest_cpuid_is_amd(vcpu));
				4275	}
				4276
				4277	static void
				4278	__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
				4279	int maxphyaddr, bool execonly)
				4280	{
				4281	u64 bad_mt_xwr;
				4282
				4283	rsvd_check->rsvd_bits_mask[0][4] =
				4284	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4285	rsvd_check->rsvd_bits_mask[0][3] =
				4286	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4287	rsvd_check->rsvd_bits_mask[0][2] =
				4288	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4289	rsvd_check->rsvd_bits_mask[0][1] =
				4290	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4291	rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
				4292
				4293	/* large page */
				4294	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
				4295	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
				4296	rsvd_check->rsvd_bits_mask[1][2] =
				4297	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 29);
				4298	rsvd_check->rsvd_bits_mask[1][1] =
				4299	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 20);
				4300	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
				4301
				4302	bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
				4303	bad_mt_xwr \|= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
				4304	bad_mt_xwr \|= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
				4305	bad_mt_xwr \|= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
				4306	bad_mt_xwr \|= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
				4307	if (!execonly) {
				4308	/* bits 0..2 must not be 100 unless VMX capabilities allow it */
				4309	bad_mt_xwr \|= REPEAT_BYTE(1ull << 4);
				4310	}
				4311	rsvd_check->bad_mt_xwr = bad_mt_xwr;
				4312	}
				4313
				4314	static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
				4315	struct kvm_mmu *context, bool execonly)
				4316	{
				4317	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
				4318	cpuid_maxphyaddr(vcpu), execonly);
				4319	}
				4320
				4321	/*
				4322	* the page table on host is the shadow page table for the page
				4323	* table in guest or amd nested guest, its mmu features completely
				4324	* follow the features in guest.
				4325	*/
				4326	void
				4327	reset_shadow_zero_bits_mask(struct kvm_vcpu vcpu, struct kvm_mmu context)
				4328	{
				4329	bool uses_nx = context->nx \|\| context->base_role.smep_andnot_wp;
				4330	struct rsvd_bits_validate *shadow_zero_check;
				4331	int i;
				4332
				4333	/*
				4334	* Passing "true" to the last argument is okay; it adds a check
				4335	* on bit 8 of the SPTEs which KVM doesn't use anyway.
				4336	*/
				4337	shadow_zero_check = &context->shadow_zero_check;
				4338	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4339	shadow_phys_bits,
				4340	context->shadow_root_level, uses_nx,
				4341	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4342	is_pse(vcpu), true);
				4343
				4344	if (!shadow_me_mask)
				4345	return;
				4346
				4347	for (i = context->shadow_root_level; --i >= 0;) {
				4348	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4349	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4350	}
				4351
				4352	}
				4353	EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
				4354
				4355	static inline bool boot_cpu_is_amd(void)
				4356	{
				4357	WARN_ON_ONCE(!tdp_enabled);
				4358	return shadow_x_mask == 0;
				4359	}
				4360
				4361	/*
				4362	* the direct page table on host, use as much mmu features as
				4363	* possible, however, kvm currently does not do execution-protection.
				4364	*/
				4365	static void
				4366	reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4367	struct kvm_mmu *context)
				4368	{
				4369	struct rsvd_bits_validate *shadow_zero_check;
				4370	int i;
				4371
				4372	shadow_zero_check = &context->shadow_zero_check;
				4373
				4374	if (boot_cpu_is_amd())
				4375	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4376	shadow_phys_bits,
				4377	context->shadow_root_level, false,
				4378	boot_cpu_has(X86_FEATURE_GBPAGES),
				4379	true, true);
				4380	else
				4381	__reset_rsvds_bits_mask_ept(shadow_zero_check,
				4382	shadow_phys_bits,
				4383	false);
				4384
				4385	if (!shadow_me_mask)
				4386	return;
				4387
				4388	for (i = context->shadow_root_level; --i >= 0;) {
				4389	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4390	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4391	}
				4392	}
				4393
				4394	/*
				4395	* as the comments in reset_shadow_zero_bits_mask() except it
				4396	* is the shadow page table for intel nested guest.
				4397	*/
				4398	static void
				4399	reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4400	struct kvm_mmu *context, bool execonly)
				4401	{
				4402	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
				4403	shadow_phys_bits, execonly);
				4404	}
				4405
				/*
				 * Build an 8-bit mask in which bit i (1 <= i <= 7) is set iff
				 * (i & access) is non-zero.  Bit 0 is never set.
				 */
				#define BYTE_MASK(access) \
					((1 & (access) ? 2 : 0) | \
					 (2 & (access) ? 4 : 0) | \
					 (3 & (access) ? 8 : 0) | \
					 (4 & (access) ? 16 : 0) | \
					 (5 & (access) ? 32 : 0) | \
					 (6 & (access) ? 64 : 0) | \
					 (7 & (access) ? 128 : 0))
				4414
				4415
				4416	static void update_permission_bitmask(struct kvm_vcpu *vcpu,
				4417	struct kvm_mmu *mmu, bool ept)
				4418	{
				4419	unsigned byte;
				4420
				4421	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
				4422	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
				4423	const u8 u = BYTE_MASK(ACC_USER_MASK);
				4424
				4425	bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
				4426	bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
				4427	bool cr0_wp = is_write_protection(vcpu);
				4428
				4429	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
				4430	unsigned pfec = byte << 1;
				4431
				4432	/*
				4433	* Each "*f" variable has a 1 bit for each UWX value
				4434	* that causes a fault with the given PFEC.
				4435	*/
				4436
				4437	/* Faults from writes to non-writable pages */
				4438	u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
				4439	/* Faults from user mode accesses to supervisor pages */
				4440	u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
				4441	/* Faults from fetches of non-executable pages*/
				4442	u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
				4443	/* Faults from kernel mode fetches of user pages */
				4444	u8 smepf = 0;
				4445	/* Faults from kernel mode accesses of user pages */
				4446	u8 smapf = 0;
				4447
				4448	if (!ept) {
				4449	/* Faults from kernel mode accesses to user pages */
				4450	u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
				4451
				4452	/* Not really needed: !nx will cause pte.nx to fault */
				4453	if (!mmu->nx)
				4454	ff = 0;
				4455
				4456	/* Allow supervisor writes if !cr0.wp */
				4457	if (!cr0_wp)
				4458	wf = (pfec & PFERR_USER_MASK) ? wf : 0;
				4459
				4460	/* Disallow supervisor fetches of user code if cr4.smep */
				4461	if (cr4_smep)
				4462	smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
				4463
				4464	/*
				4465	* SMAP:kernel-mode data accesses from user-mode
				4466	* mappings should fault. A fault is considered
				4467	* as a SMAP violation if all of the following
				4468	* conditions are ture:
				4469	* - X86_CR4_SMAP is set in CR4
				4470	* - A user page is accessed
				4471	* - The access is not a fetch
				4472	* - Page fault in kernel mode
				4473	* - if CPL = 3 or X86_EFLAGS_AC is clear
				4474	*
				4475	* Here, we cover the first three conditions.
				4476	* The fourth is computed dynamically in permission_fault();
				4477	* PFERR_RSVD_MASK bit will be set in PFEC if the access is
				4478	* not subject to SMAP restrictions.
				4479	*/
				4480	if (cr4_smap)
				4481	smapf = (pfec & (PFERR_RSVD_MASK\|PFERR_FETCH_MASK)) ? 0 : kf;
				4482	}
				4483
				4484	mmu->permissions[byte] = ff \| uf \| wf \| smepf \| smapf;
				4485	}
				4486	}
				4487
				4488	/*
				4489	* PKU is an additional mechanism by which the paging controls access to
				4490	* user-mode addresses based on the value in the PKRU register. Protection
				4491	* key violations are reported through a bit in the page fault error code.
				4492	* Unlike other bits of the error code, the PK bit is not known at the
				4493	* call site of e.g. gva_to_gpa; it must be computed directly in
				4494	* permission_fault based on two bits of PKRU, on some machine state (CR4,
				4495	* CR0, EFER, CPL), and on other bits of the error code and the page tables.
				4496	*
				4497	* In particular the following conditions come from the error code, the
				4498	* page tables and the machine state:
				4499	* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
				4500	* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
				4501	* - PK is always zero if U=0 in the page tables
				4502	* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
				4503	*
				4504	* The PKRU bitmask caches the result of these four conditions. The error
				4505	* code (minus the P bit) and the page table's U bit form an index into the
				4506	* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
				4507	* with the two bits of the PKRU register corresponding to the protection key.
				4508	* For the first three conditions above the bits will be 00, thus masking
				4509	* away both AD and WD. For all reads or if the last condition holds, WD
				4510	* only will be masked away.
				4511	*/
				4512	static void update_pkru_bitmask(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				4513	bool ept)
				4514	{
				4515	unsigned bit;
				4516	bool wp;
				4517
				4518	if (ept) {
				4519	mmu->pkru_mask = 0;
				4520	return;
				4521	}
				4522
				4523	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
				4524	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) \|\| !is_long_mode(vcpu)) {
				4525	mmu->pkru_mask = 0;
				4526	return;
				4527	}
				4528
				4529	wp = is_write_protection(vcpu);
				4530
				4531	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
				4532	unsigned pfec, pkey_bits;
				4533	bool check_pkey, check_write, ff, uf, wf, pte_user;
				4534
				4535	pfec = bit << 1;
				4536	ff = pfec & PFERR_FETCH_MASK;
				4537	uf = pfec & PFERR_USER_MASK;
				4538	wf = pfec & PFERR_WRITE_MASK;
				4539
				4540	/* PFEC.RSVD is replaced by ACC_USER_MASK. */
				4541	pte_user = pfec & PFERR_RSVD_MASK;
				4542
				4543	/*
				4544	* Only need to check the access which is not an
				4545	* instruction fetch and is to a user page.
				4546	*/
				4547	check_pkey = (!ff && pte_user);
				4548	/*
				4549	* write access is controlled by PKRU if it is a
				4550	* user access or CR0.WP = 1.
				4551	*/
				4552	check_write = check_pkey && wf && (uf \|\| wp);
				4553
				4554	/* PKRU.AD stops both read and write access. */
				4555	pkey_bits = !!check_pkey;
				4556	/* PKRU.WD stops write access. */
				4557	pkey_bits \|= (!!check_write) << 1;
				4558
				4559	mmu->pkru_mask \|= (pkey_bits & 3) << pfec;
				4560	}
				4561	}
				4562
				4563	static void update_last_nonleaf_level(struct kvm_vcpu vcpu, struct kvm_mmu mmu)
				4564	{
				4565	unsigned root_level = mmu->root_level;
				4566
				4567	mmu->last_nonleaf_level = root_level;
				4568	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
				4569	mmu->last_nonleaf_level++;
				4570	}
				4571
				4572	static void paging64_init_context_common(struct kvm_vcpu *vcpu,
				4573	struct kvm_mmu *context,
				4574	int level)
				4575	{
				4576	context->nx = is_nx(vcpu);
				4577	context->root_level = level;
				4578
				4579	reset_rsvds_bits_mask(vcpu, context);
				4580	update_permission_bitmask(vcpu, context, false);
				4581	update_pkru_bitmask(vcpu, context, false);
				4582	update_last_nonleaf_level(vcpu, context);
				4583
				4584	MMU_WARN_ON(!is_pae(vcpu));
				4585	context->page_fault = paging64_page_fault;
				4586	context->gva_to_gpa = paging64_gva_to_gpa;
				4587	context->sync_page = paging64_sync_page;
				4588	context->invlpg = paging64_invlpg;
				4589	context->update_pte = paging64_update_pte;
				4590	context->shadow_root_level = level;
				4591	context->root_hpa = INVALID_PAGE;
				4592	context->direct_map = false;
				4593	}
				4594
				4595	static void paging64_init_context(struct kvm_vcpu *vcpu,
				4596	struct kvm_mmu *context)
				4597	{
				4598	int root_level = is_la57_mode(vcpu) ?
				4599	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4600
				4601	paging64_init_context_common(vcpu, context, root_level);
				4602	}
				4603
				4604	static void paging32_init_context(struct kvm_vcpu *vcpu,
				4605	struct kvm_mmu *context)
				4606	{
				4607	context->nx = false;
				4608	context->root_level = PT32_ROOT_LEVEL;
				4609
				4610	reset_rsvds_bits_mask(vcpu, context);
				4611	update_permission_bitmask(vcpu, context, false);
				4612	update_pkru_bitmask(vcpu, context, false);
				4613	update_last_nonleaf_level(vcpu, context);
				4614
				4615	context->page_fault = paging32_page_fault;
				4616	context->gva_to_gpa = paging32_gva_to_gpa;
				4617	context->sync_page = paging32_sync_page;
				4618	context->invlpg = paging32_invlpg;
				4619	context->update_pte = paging32_update_pte;
				4620	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4621	context->root_hpa = INVALID_PAGE;
				4622	context->direct_map = false;
				4623	}
				4624
				4625	static void paging32E_init_context(struct kvm_vcpu *vcpu,
				4626	struct kvm_mmu *context)
				4627	{
				4628	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
				4629	}
				4630
				4631	static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
				4632	{
				4633	struct kvm_mmu *context = &vcpu->arch.mmu;
				4634
				4635	context->base_role.word = 0;
				4636	context->base_role.smm = is_smm(vcpu);
				4637	context->base_role.ad_disabled = (shadow_accessed_mask == 0);
				4638	context->page_fault = tdp_page_fault;
				4639	context->sync_page = nonpaging_sync_page;
				4640	context->invlpg = nonpaging_invlpg;
				4641	context->update_pte = nonpaging_update_pte;
				4642	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
				4643	context->root_hpa = INVALID_PAGE;
				4644	context->direct_map = true;
				4645	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
				4646	context->get_cr3 = get_cr3;
				4647	context->get_pdptr = kvm_pdptr_read;
				4648	context->inject_page_fault = kvm_inject_page_fault;
				4649
				4650	if (!is_paging(vcpu)) {
				4651	context->nx = false;
				4652	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4653	context->root_level = 0;
				4654	} else if (is_long_mode(vcpu)) {
				4655	context->nx = is_nx(vcpu);
				4656	context->root_level = is_la57_mode(vcpu) ?
				4657	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4658	reset_rsvds_bits_mask(vcpu, context);
				4659	context->gva_to_gpa = paging64_gva_to_gpa;
				4660	} else if (is_pae(vcpu)) {
				4661	context->nx = is_nx(vcpu);
				4662	context->root_level = PT32E_ROOT_LEVEL;
				4663	reset_rsvds_bits_mask(vcpu, context);
				4664	context->gva_to_gpa = paging64_gva_to_gpa;
				4665	} else {
				4666	context->nx = false;
				4667	context->root_level = PT32_ROOT_LEVEL;
				4668	reset_rsvds_bits_mask(vcpu, context);
				4669	context->gva_to_gpa = paging32_gva_to_gpa;
				4670	}
				4671
				4672	update_permission_bitmask(vcpu, context, false);
				4673	update_pkru_bitmask(vcpu, context, false);
				4674	update_last_nonleaf_level(vcpu, context);
				4675	reset_tdp_shadow_zero_bits_mask(vcpu, context);
				4676	}
				4677
				4678	void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
				4679	{
				4680	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
				4681	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
				4682	struct kvm_mmu *context = &vcpu->arch.mmu;
				4683
				4684	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
				4685
				4686	if (!is_paging(vcpu))
				4687	nonpaging_init_context(vcpu, context);
				4688	else if (is_long_mode(vcpu))
				4689	paging64_init_context(vcpu, context);
				4690	else if (is_pae(vcpu))
				4691	paging32E_init_context(vcpu, context);
				4692	else
				4693	paging32_init_context(vcpu, context);
				4694
				4695	context->base_role.nxe = is_nx(vcpu);
				4696	context->base_role.cr4_pae = !!is_pae(vcpu);
				4697	context->base_role.cr0_wp = is_write_protection(vcpu);
				4698	context->base_role.smep_andnot_wp
				4699	= smep && !is_write_protection(vcpu);
				4700	context->base_role.smap_andnot_wp
				4701	= smap && !is_write_protection(vcpu);
				4702	context->base_role.smm = is_smm(vcpu);
				4703	reset_shadow_zero_bits_mask(vcpu, context);
				4704	}
				4705	EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
				4706
				4707	void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
				4708	bool accessed_dirty)
				4709	{
				4710	struct kvm_mmu *context = &vcpu->arch.mmu;
				4711
				4712	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
				4713
				4714	context->shadow_root_level = PT64_ROOT_4LEVEL;
				4715
				4716	context->nx = true;
				4717	context->ept_ad = accessed_dirty;
				4718	context->page_fault = ept_page_fault;
				4719	context->gva_to_gpa = ept_gva_to_gpa;
				4720	context->sync_page = ept_sync_page;
				4721	context->invlpg = ept_invlpg;
				4722	context->update_pte = ept_update_pte;
				4723	context->root_level = PT64_ROOT_4LEVEL;
				4724	context->root_hpa = INVALID_PAGE;
				4725	context->direct_map = false;
				4726	context->base_role.ad_disabled = !accessed_dirty;
				4727
				4728	update_permission_bitmask(vcpu, context, true);
				4729	update_pkru_bitmask(vcpu, context, true);
				4730	update_last_nonleaf_level(vcpu, context);
				4731	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
				4732	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
				4733	}
				4734	EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
				4735
				4736	static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
				4737	{
				4738	struct kvm_mmu *context = &vcpu->arch.mmu;
				4739
				4740	kvm_init_shadow_mmu(vcpu);
				4741	context->set_cr3 = kvm_x86_ops->set_cr3;
				4742	context->get_cr3 = get_cr3;
				4743	context->get_pdptr = kvm_pdptr_read;
				4744	context->inject_page_fault = kvm_inject_page_fault;
				4745	}
				4746
				4747	static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
				4748	{
				4749	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
				4750
				4751	g_context->get_cr3 = get_cr3;
				4752	g_context->get_pdptr = kvm_pdptr_read;
				4753	g_context->inject_page_fault = kvm_inject_page_fault;
				4754
				4755	/*
				4756	* Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
				4757	* L1's nested page tables (e.g. EPT12). The nested translation
				4758	* of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
				4759	* L2's page tables as the first level of translation and L1's
				4760	* nested page tables as the second level of translation. Basically
				4761	* the gva_to_gpa functions between mmu and nested_mmu are swapped.
				4762	*/
				4763	if (!is_paging(vcpu)) {
				4764	g_context->nx = false;
				4765	g_context->root_level = 0;
				4766	g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
				4767	} else if (is_long_mode(vcpu)) {
				4768	g_context->nx = is_nx(vcpu);
				4769	g_context->root_level = is_la57_mode(vcpu) ?
				4770	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4771	reset_rsvds_bits_mask(vcpu, g_context);
				4772	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				4773	} else if (is_pae(vcpu)) {
				4774	g_context->nx = is_nx(vcpu);
				4775	g_context->root_level = PT32E_ROOT_LEVEL;
				4776	reset_rsvds_bits_mask(vcpu, g_context);
				4777	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				4778	} else {
				4779	g_context->nx = false;
				4780	g_context->root_level = PT32_ROOT_LEVEL;
				4781	reset_rsvds_bits_mask(vcpu, g_context);
				4782	g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
				4783	}
				4784
				4785	update_permission_bitmask(vcpu, g_context, false);
				4786	update_pkru_bitmask(vcpu, g_context, false);
				4787	update_last_nonleaf_level(vcpu, g_context);
				4788	}
				4789
				4790	static void init_kvm_mmu(struct kvm_vcpu *vcpu)
				4791	{
				4792	if (mmu_is_nested(vcpu))
				4793	init_kvm_nested_mmu(vcpu);
				4794	else if (tdp_enabled)
				4795	init_kvm_tdp_mmu(vcpu);
				4796	else
				4797	init_kvm_softmmu(vcpu);
				4798	}
				4799
				/* Unload the vcpu's MMU, then re-run MMU initialisation for it. */
				void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
				{
					/* The unload must come first. */
					kvm_mmu_unload(vcpu);

					init_kvm_mmu(vcpu);
				}
				EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
				4806
				4807	int kvm_mmu_load(struct kvm_vcpu *vcpu)
				4808	{
				4809	int r;
				4810
				4811	r = mmu_topup_memory_caches(vcpu);
				4812	if (r)
				4813	goto out;
				4814	r = mmu_alloc_roots(vcpu);
				4815	kvm_mmu_sync_roots(vcpu);
				4816	if (r)
				4817	goto out;
				4818	/* set_cr3() should ensure TLB has been flushed */
				4819	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
				4820	out:
				4821	return r;
				4822	}
				4823	EXPORT_SYMBOL_GPL(kvm_mmu_load);
				4824
				4825	void kvm_mmu_unload(struct kvm_vcpu *vcpu)
				4826	{
				4827	mmu_free_roots(vcpu);
				4828	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4829	}
				4830	EXPORT_SYMBOL_GPL(kvm_mmu_unload);
				4831
				4832	static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				4833	struct kvm_mmu_page sp, u64 spte,
				4834	const void *new)
				4835	{
				4836	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
				4837	++vcpu->kvm->stat.mmu_pde_zapped;
				4838	return;
				4839	}
				4840
				4841	++vcpu->kvm->stat.mmu_pte_updated;
				4842	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
				4843	}
				4844
				4845	static bool need_remote_flush(u64 old, u64 new)
				4846	{
				4847	if (!is_shadow_present_pte(old))
				4848	return false;
				4849	if (!is_shadow_present_pte(new))
				4850	return true;
				4851	if ((old ^ new) & PT64_BASE_ADDR_MASK)
				4852	return true;
				4853	old ^= shadow_nx_mask;
				4854	new ^= shadow_nx_mask;
				4855	return (old & ~new & PT64_PERM_MASK) != 0;
				4856	}
				4857
				4858	static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu vcpu, gpa_t gpa,
				4859	int *bytes)
				4860	{
				4861	u64 gentry = 0;
				4862	int r;
				4863
				4864	/*
				4865	* Assume that the pte write on a page table of the same type
				4866	* as the current vcpu paging mode since we update the sptes only
				4867	* when they have the same mode.
				4868	*/
				4869	if (is_pae(vcpu) && *bytes == 4) {
				4870	/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
				4871	*gpa &= ~(gpa_t)7;
				4872	*bytes = 8;
				4873	}
				4874
				4875	if (bytes == 4 \|\| bytes == 8) {
				4876	r = kvm_vcpu_read_guest_atomic(vcpu, gpa, &gentry, bytes);
				4877	if (r)
				4878	gentry = 0;
				4879	}
				4880
				4881	return gentry;
				4882	}
				4883
				4884	/*
				4885	* If we're seeing too many writes to a page, it may no longer be a page table,
				4886	* or we may be forking, in which case it is better to unmap the page.
				4887	*/
				4888	static bool detect_write_flooding(struct kvm_mmu_page *sp)
				4889	{
				4890	/*
				4891	* Skip write-flooding detected for the sp whose level is 1, because
				4892	* it can become unsync, then the guest page is not write-protected.
				4893	*/
				4894	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
				4895	return false;
				4896
				4897	atomic_inc(&sp->write_flooding_count);
				4898	return atomic_read(&sp->write_flooding_count) >= 3;
				4899	}
				4900
				4901	/*
				4902	* Misaligned accesses are too much trouble to fix up; also, they usually
				4903	* indicate a page is not used as a page table.
				4904	*/
				4905	static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				4906	int bytes)
				4907	{
				4908	unsigned offset, pte_size, misaligned;
				4909
				4910	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				4911	gpa, bytes, sp->role.word);
				4912
				4913	offset = offset_in_page(gpa);
				4914	pte_size = sp->role.cr4_pae ? 8 : 4;
				4915
				4916	/*
				4917	* Sometimes, the OS only writes the last one bytes to update status
				4918	* bits, for example, in linux, andb instruction is used in clear_bit().
				4919	*/
				4920	if (!(offset & (pte_size - 1)) && bytes == 1)
				4921	return false;
				4922
				4923	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
				4924	misaligned \|= bytes < 4;
				4925
				4926	return misaligned;
				4927	}
				4928
				4929	static u64 get_written_sptes(struct kvm_mmu_page sp, gpa_t gpa, int *nspte)
				4930	{
				4931	unsigned page_offset, quadrant;
				4932	u64 *spte;
				4933	int level;
				4934
				4935	page_offset = offset_in_page(gpa);
				4936	level = sp->role.level;
				4937	*nspte = 1;
				4938	if (!sp->role.cr4_pae) {
				4939	page_offset <<= 1; /* 32->64 */
				4940	/*
				4941	* A 32-bit pde maps 4MB while the shadow pdes map
				4942	* only 2MB. So we need to double the offset again
				4943	* and zap two pdes instead of one.
				4944	*/
				4945	if (level == PT32_ROOT_LEVEL) {
				4946	page_offset &= ~7; /* kill rounding error */
				4947	page_offset <<= 1;
				4948	*nspte = 2;
				4949	}
				4950	quadrant = page_offset >> PAGE_SHIFT;
				4951	page_offset &= ~PAGE_MASK;
				4952	if (quadrant != sp->role.quadrant)
				4953	return NULL;
				4954	}
				4955
				4956	spte = &sp->spt[page_offset / sizeof(*spte)];
				4957	return spte;
				4958	}
				4959
				4960	static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
				4961	const u8 *new, int bytes,
				4962	struct kvm_page_track_notifier_node *node)
				4963	{
				4964	gfn_t gfn = gpa >> PAGE_SHIFT;
				4965	struct kvm_mmu_page *sp;
				4966	LIST_HEAD(invalid_list);
				4967	u64 entry, gentry, *spte;
				4968	int npte;
				4969	bool remote_flush, local_flush;
				4970	union kvm_mmu_page_role mask = { };
				4971
				4972	mask.cr0_wp = 1;
				4973	mask.cr4_pae = 1;
				4974	mask.nxe = 1;
				4975	mask.smep_andnot_wp = 1;
				4976	mask.smap_andnot_wp = 1;
				4977	mask.smm = 1;
				4978	mask.ad_disabled = 1;
				4979
				4980	/*
				4981	* If we don't have indirect shadow pages, it means no page is
				4982	* write-protected, so we can exit simply.
				4983	*/
				4984	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
				4985	return;
				4986
				4987	remote_flush = local_flush = false;
				4988
				4989	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
				4990
				4991	/*
				4992	* No need to care whether allocation memory is successful
				4993	* or not since pte prefetch is skiped if it does not have
				4994	* enough objects in the cache.
				4995	*/
				4996	mmu_topup_memory_caches(vcpu);
				4997
				4998	spin_lock(&vcpu->kvm->mmu_lock);
				4999
				5000	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
				5001
				5002	++vcpu->kvm->stat.mmu_pte_write;
				5003	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
				5004
				5005	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				5006	if (detect_write_misaligned(sp, gpa, bytes) \|\|
				5007	detect_write_flooding(sp)) {
				5008	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
				5009	++vcpu->kvm->stat.mmu_flooded;
				5010	continue;
				5011	}
				5012
				5013	spte = get_written_sptes(sp, gpa, &npte);
				5014	if (!spte)
				5015	continue;
				5016
				5017	local_flush = true;
				5018	while (npte--) {
				5019	entry = *spte;
				5020	mmu_page_zap_pte(vcpu->kvm, sp, spte);
				5021	if (gentry &&
				5022	!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
				5023	& mask.word) && rmap_can_add(vcpu))
				5024	mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
				5025	if (need_remote_flush(entry, *spte))
				5026	remote_flush = true;
				5027	++spte;
				5028	}
				5029	}
				5030	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
				5031	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
				5032	spin_unlock(&vcpu->kvm->mmu_lock);
				5033	}
				5034
				5035	int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
				5036	{
				5037	gpa_t gpa;
				5038	int r;
				5039
				5040	if (vcpu->arch.mmu.direct_map)
				5041	return 0;
				5042
				5043	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
				5044
				5045	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
				5046
				5047	return r;
				5048	}
				5049	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
				5050
				5051	static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
				5052	{
				5053	LIST_HEAD(invalid_list);
				5054
				5055	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
				5056	return 0;
				5057
				5058	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
				5059	if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
				5060	break;
				5061
				5062	++vcpu->kvm->stat.mmu_recycled;
				5063	}
				5064	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				5065
				5066	if (!kvm_mmu_available_pages(vcpu->kvm))
				5067	return -ENOSPC;
				5068	return 0;
				5069	}
				5070
				5071	int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
				5072	void *insn, int insn_len)
				5073	{
				5074	int r, emulation_type = EMULTYPE_RETRY;
				5075	enum emulation_result er;
				5076	bool direct = vcpu->arch.mmu.direct_map;
				5077
				5078	/* With shadow page tables, fault_address contains a GVA or nGPA. */
				5079	if (vcpu->arch.mmu.direct_map) {
				5080	vcpu->arch.gpa_available = true;
				5081	vcpu->arch.gpa_val = cr2;
				5082	}
				5083
				5084	r = RET_PF_INVALID;
				5085	if (unlikely(error_code & PFERR_RSVD_MASK)) {
				5086	r = handle_mmio_page_fault(vcpu, cr2, direct);
				5087	if (r == RET_PF_EMULATE) {
				5088	emulation_type = 0;
				5089	goto emulate;
				5090	}
				5091	}
				5092
				5093	if (r == RET_PF_INVALID) {
				5094	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
				5095	false);
				5096	WARN_ON(r == RET_PF_INVALID);
				5097	}
				5098
				5099	if (r == RET_PF_RETRY)
				5100	return 1;
				5101	if (r < 0)
				5102	return r;
				5103
				5104	/*
				5105	* Before emulating the instruction, check if the error code
				5106	* was due to a RO violation while translating the guest page.
				5107	* This can occur when using nested virtualization with nested
				5108	* paging in both guests. If true, we simply unprotect the page
				5109	* and resume the guest.
				5110	*/
				5111	if (vcpu->arch.mmu.direct_map &&
				5112	(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
				5113	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
				5114	return 1;
				5115	}
				5116
				5117	if (mmio_info_in_cache(vcpu, cr2, direct))
				5118	emulation_type = 0;
				5119	emulate:
				5120	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
				5121
				5122	switch (er) {
				5123	case EMULATE_DONE:
				5124	return 1;
				5125	case EMULATE_USER_EXIT:
				5126	++vcpu->stat.mmio_exits;
				5127	/* fall through */
				5128	case EMULATE_FAIL:
				5129	return 0;
				5130	default:
				5131	BUG();
				5132	}
				5133	}
				5134	EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
				5135
				5136	void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
				5137	{
				5138	vcpu->arch.mmu.invlpg(vcpu, gva);
				5139	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				5140	++vcpu->stat.invlpg;
				5141	}
				5142	EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
				5143
				5144	void kvm_enable_tdp(void)
				5145	{
				5146	tdp_enabled = true;
				5147	}
				5148	EXPORT_SYMBOL_GPL(kvm_enable_tdp);
				5149
				5150	void kvm_disable_tdp(void)
				5151	{
				5152	tdp_enabled = false;
				5153	}
				5154	EXPORT_SYMBOL_GPL(kvm_disable_tdp);
				5155
				5156	static void free_mmu_pages(struct kvm_vcpu *vcpu)
				5157	{
				5158	free_page((unsigned long)vcpu->arch.mmu.pae_root);
				5159	if (vcpu->arch.mmu.lm_root != NULL)
				5160	free_page((unsigned long)vcpu->arch.mmu.lm_root);
				5161	}
				5162
				5163	static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
				5164	{
				5165	struct page *page;
				5166	int i;
				5167
				5168	/*
				5169	* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
				5170	* Therefore we need to allocate shadow page tables in the first
				5171	* 4GB of memory, which happens to fit the DMA32 zone.
				5172	*/
				5173	page = alloc_page(GFP_KERNEL \| __GFP_DMA32);
				5174	if (!page)
				5175	return -ENOMEM;
				5176
				5177	vcpu->arch.mmu.pae_root = page_address(page);
				5178	for (i = 0; i < 4; ++i)
				5179	vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
				5180
				5181	return 0;
				5182	}
				5183
				5184	int kvm_mmu_create(struct kvm_vcpu *vcpu)
				5185	{
				5186	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
				5187	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				5188	vcpu->arch.mmu.translate_gpa = translate_gpa;
				5189	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
				5190
				5191	return alloc_mmu_pages(vcpu);
				5192	}
				5193
				5194	void kvm_mmu_setup(struct kvm_vcpu *vcpu)
				5195	{
				5196	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				5197
				5198	init_kvm_mmu(vcpu);
				5199	}
				5200
				/*
				 * Page-track ->track_flush_slot callback (installed by kvm_mmu_init_vm).
				 * @slot and @node are ignored: every shadow page of the VM is invalidated.
				 */
				static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
							struct kvm_memory_slot *slot,
							struct kvm_page_track_notifier_node *node)
				{
					kvm_mmu_invalidate_zap_all_pages(kvm);
				}
				5207
				5208	void kvm_mmu_init_vm(struct kvm *kvm)
				5209	{
				5210	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5211
				5212	node->track_write = kvm_mmu_pte_write;
				5213	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
				5214	kvm_page_track_register_notifier(kvm, node);
				5215	}
				5216
				5217	void kvm_mmu_uninit_vm(struct kvm *kvm)
				5218	{
				5219	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5220
				5221	kvm_page_track_unregister_notifier(kvm, node);
				5222	}
				5223
				5224	/* The return value indicates if tlb flush on all vcpus is needed. */
				5225	typedef bool (slot_level_handler) (struct kvm kvm, struct kvm_rmap_head *rmap_head);
				5226
				5227	/* The caller should hold mmu-lock before calling this function. */
				5228	static __always_inline bool
				5229	slot_handle_level_range(struct kvm kvm, struct kvm_memory_slot memslot,
				5230	slot_level_handler fn, int start_level, int end_level,
				5231	gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
				5232	{
				5233	struct slot_rmap_walk_iterator iterator;
				5234	bool flush = false;
				5235
				5236	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
				5237	end_gfn, &iterator) {
				5238	if (iterator.rmap)
				5239	flush \|= fn(kvm, iterator.rmap);
				5240
				5241	if (need_resched() \|\| spin_needbreak(&kvm->mmu_lock)) {
				5242	if (flush && lock_flush_tlb) {
				5243	kvm_flush_remote_tlbs(kvm);
				5244	flush = false;
				5245	}
				5246	cond_resched_lock(&kvm->mmu_lock);
				5247	}
				5248	}
				5249
				5250	if (flush && lock_flush_tlb) {
				5251	kvm_flush_remote_tlbs(kvm);
				5252	flush = false;
				5253	}
				5254
				5255	return flush;
				5256	}
				5257
				5258	static __always_inline bool
				5259	slot_handle_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5260	slot_level_handler fn, int start_level, int end_level,
				5261	bool lock_flush_tlb)
				5262	{
				5263	return slot_handle_level_range(kvm, memslot, fn, start_level,
				5264	end_level, memslot->base_gfn,
				5265	memslot->base_gfn + memslot->npages - 1,
				5266	lock_flush_tlb);
				5267	}
				5268
				5269	static __always_inline bool
				5270	slot_handle_all_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5271	slot_level_handler fn, bool lock_flush_tlb)
				5272	{
				5273	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5274	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5275	}
				5276
				5277	static __always_inline bool
				5278	slot_handle_large_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5279	slot_level_handler fn, bool lock_flush_tlb)
				5280	{
				5281	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
				5282	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5283	}
				5284
				5285	static __always_inline bool
				5286	slot_handle_leaf(struct kvm kvm, struct kvm_memory_slot memslot,
				5287	slot_level_handler fn, bool lock_flush_tlb)
				5288	{
				5289	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5290	PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
				5291	}
				5292
				5293	void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
				5294	{
				5295	struct kvm_memslots *slots;
				5296	struct kvm_memory_slot *memslot;
				5297	int i;
				5298
				5299	spin_lock(&kvm->mmu_lock);
				5300	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				5301	slots = __kvm_memslots(kvm, i);
				5302	kvm_for_each_memslot(memslot, slots) {
				5303	gfn_t start, end;
				5304
				5305	start = max(gfn_start, memslot->base_gfn);
				5306	end = min(gfn_end, memslot->base_gfn + memslot->npages);
				5307	if (start >= end)
				5308	continue;
				5309
				5310	slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
				5311	PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
				5312	start, end - 1, true);
				5313	}
				5314	}
				5315
				5316	spin_unlock(&kvm->mmu_lock);
				5317	}
				5318
				5319	static bool slot_rmap_write_protect(struct kvm *kvm,
				5320	struct kvm_rmap_head *rmap_head)
				5321	{
				5322	return __rmap_write_protect(kvm, rmap_head, false);
				5323	}
				5324
				5325	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				5326	struct kvm_memory_slot *memslot)
				5327	{
				5328	bool flush;
				5329
				5330	spin_lock(&kvm->mmu_lock);
				5331	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
				5332	false);
				5333	spin_unlock(&kvm->mmu_lock);
				5334
				5335	/*
				5336	* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
				5337	* which do tlb flush out of mmu-lock should be serialized by
				5338	* kvm->slots_lock otherwise tlb flush would be missed.
				5339	*/
				5340	lockdep_assert_held(&kvm->slots_lock);
				5341
				5342	/*
				5343	* We can flush all the TLBs out of the mmu lock without TLB
				5344	* corruption since we just change the spte from writable to
				5345	* readonly so that we only need to care the case of changing
				5346	* spte from present to present (changing the spte from present
				5347	* to nonpresent will flush all the TLBs immediately), in other
				5348	* words, the only case we care is mmu_spte_update() where we
				5349	* haved checked SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE
				5350	* instead of PT_WRITABLE_MASK, that means it does not depend
				5351	* on PT_WRITABLE_MASK anymore.
				5352	*/
				5353	if (flush)
				5354	kvm_flush_remote_tlbs(kvm);
				5355	}
				5356
				5357	static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
				5358	struct kvm_rmap_head *rmap_head)
				5359	{
				5360	u64 *sptep;
				5361	struct rmap_iterator iter;
				5362	int need_tlb_flush = 0;
				5363	kvm_pfn_t pfn;
				5364	struct kvm_mmu_page *sp;
				5365
				5366	restart:
				5367	for_each_rmap_spte(rmap_head, &iter, sptep) {
				5368	sp = page_header(__pa(sptep));
				5369	pfn = spte_to_pfn(*sptep);
				5370
				5371	/*
				5372	* We cannot do huge page mapping for indirect shadow pages,
				5373	* which are found on the last rmap (level = 1) when not using
				5374	* tdp; such shadow pages are synced with the page table in
				5375	* the guest, and the guest page table is using 4K page size
				5376	* mapping if the indirect sp has level = 1.
				5377	*/
				5378	if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
				5379	!kvm_is_zone_device_pfn(pfn) &&
				5380	PageTransCompoundMap(pfn_to_page(pfn))) {
				5381	drop_spte(kvm, sptep);
				5382	need_tlb_flush = 1;
				5383	goto restart;
				5384	}
				5385	}
				5386
				5387	return need_tlb_flush;
				5388	}
				5389
				5390	void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				5391	const struct kvm_memory_slot *memslot)
				5392	{
				5393	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
				5394	spin_lock(&kvm->mmu_lock);
				5395	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
				5396	kvm_mmu_zap_collapsible_spte, true);
				5397	spin_unlock(&kvm->mmu_lock);
				5398	}
				5399
				5400	void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				5401	struct kvm_memory_slot *memslot)
				5402	{
				5403	bool flush;
				5404
				5405	spin_lock(&kvm->mmu_lock);
				5406	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
				5407	spin_unlock(&kvm->mmu_lock);
				5408
				5409	lockdep_assert_held(&kvm->slots_lock);
				5410
				5411	/*
				5412	* It's also safe to flush TLBs out of mmu lock here as currently this
				5413	* function is only used for dirty logging, in which case flushing TLB
				5414	* out of mmu lock also guarantees no dirty pages will be lost in
				5415	* dirty_bitmap.
				5416	*/
				5417	if (flush)
				5418	kvm_flush_remote_tlbs(kvm);
				5419	}
				5420	EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
				5421
				5422	void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
				5423	struct kvm_memory_slot *memslot)
				5424	{
				5425	bool flush;
				5426
				5427	spin_lock(&kvm->mmu_lock);
				5428	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
				5429	false);
				5430	spin_unlock(&kvm->mmu_lock);
				5431
				5432	/* see kvm_mmu_slot_remove_write_access */
				5433	lockdep_assert_held(&kvm->slots_lock);
				5434
				5435	if (flush)
				5436	kvm_flush_remote_tlbs(kvm);
				5437	}
				5438	EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
				5439
				5440	void kvm_mmu_slot_set_dirty(struct kvm *kvm,
				5441	struct kvm_memory_slot *memslot)
				5442	{
				5443	bool flush;
				5444
				5445	spin_lock(&kvm->mmu_lock);
				5446	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
				5447	spin_unlock(&kvm->mmu_lock);
				5448
				5449	lockdep_assert_held(&kvm->slots_lock);
				5450
				5451	/* see kvm_mmu_slot_leaf_clear_dirty */
				5452	if (flush)
				5453	kvm_flush_remote_tlbs(kvm);
				5454	}
				5455	EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
				5456
				5457	#define BATCH_ZAP_PAGES 10
				5458	static void kvm_zap_obsolete_pages(struct kvm *kvm)
				5459	{
				5460	struct kvm_mmu_page sp, node;
				5461	int batch = 0;
				5462
				5463	restart:
				5464	list_for_each_entry_safe_reverse(sp, node,
				5465	&kvm->arch.active_mmu_pages, link) {
				5466	int ret;
				5467
				5468	/*
				5469	* No obsolete page exists before new created page since
				5470	* active_mmu_pages is the FIFO list.
				5471	*/
				5472	if (!is_obsolete_sp(kvm, sp))
				5473	break;
				5474
				5475	/*
				5476	* Since we are reversely walking the list and the invalid
				5477	* list will be moved to the head, skip the invalid page
				5478	* can help us to avoid the infinity list walking.
				5479	*/
				5480	if (sp->role.invalid)
				5481	continue;
				5482
				5483	/*
				5484	* Need not flush tlb since we only zap the sp with invalid
				5485	* generation number.
				5486	*/
				5487	if (batch >= BATCH_ZAP_PAGES &&
				5488	cond_resched_lock(&kvm->mmu_lock)) {
				5489	batch = 0;
				5490	goto restart;
				5491	}
				5492
				5493	ret = kvm_mmu_prepare_zap_page(kvm, sp,
				5494	&kvm->arch.zapped_obsolete_pages);
				5495	batch += ret;
				5496
				5497	if (ret)
				5498	goto restart;
				5499	}
				5500
				5501	/*
				5502	* Should flush tlb before free page tables since lockless-walking
				5503	* may use the pages.
				5504	*/
				5505	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
				5506	}
				5507
				5508	/*
				5509	* Fast invalidate all shadow pages and use lock-break technique
				5510	* to zap obsolete pages.
				5511	*
				5512	* It's required when memslot is being deleted or VM is being
				5513	* destroyed, in these cases, we should ensure that KVM MMU does
				5514	* not use any resource of the being-deleted slot or all slots
				5515	* after calling the function.
				5516	*/
				5517	void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
				5518	{
				5519	spin_lock(&kvm->mmu_lock);
				5520	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
				5521	kvm->arch.mmu_valid_gen++;
				5522
				5523	/*
				5524	* Notify all vcpus to reload its shadow page table
				5525	* and flush TLB. Then all vcpus will switch to new
				5526	* shadow page table with the new mmu_valid_gen.
				5527	*
				5528	* Note: we should do this under the protection of
				5529	* mmu-lock, otherwise, vcpu would purge shadow page
				5530	* but miss tlb flush.
				5531	*/
				5532	kvm_reload_remote_mmus(kvm);
				5533
				5534	kvm_zap_obsolete_pages(kvm);
				5535	spin_unlock(&kvm->mmu_lock);
				5536	}
				5537
				5538	static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
				5539	{
				5540	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
				5541	}
				5542
				5543	void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
				5544	{
				5545	gen &= MMIO_GEN_MASK;
				5546
				5547	/*
				5548	* Shift to eliminate the "update in-progress" flag, which isn't
				5549	* included in the spte's generation number.
				5550	*/
				5551	gen >>= 1;
				5552
				5553	/*
				5554	* Generation numbers are incremented in multiples of the number of
				5555	* address spaces in order to provide unique generations across all
				5556	* address spaces. Strip what is effectively the address space
				5557	* modifier prior to checking for a wrap of the MMIO generation so
				5558	* that a wrap in any address space is detected.
				5559	*/
				5560	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
				5561
				5562	/*
				5563	* The very rare case: if the MMIO generation number has wrapped,
				5564	* zap all shadow pages.
				5565	*/
				5566	if (unlikely(gen == 0)) {
				5567	kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
				5568	kvm_mmu_invalidate_zap_all_pages(kvm);
				5569	}
				5570	}
				5571
				5572	static unsigned long
				5573	mmu_shrink_scan(struct shrinker shrink, struct shrink_control sc)
				5574	{
				5575	struct kvm *kvm;
				5576	int nr_to_scan = sc->nr_to_scan;
				5577	unsigned long freed = 0;
				5578
				5579	mutex_lock(&kvm_lock);
				5580
				5581	list_for_each_entry(kvm, &vm_list, vm_list) {
				5582	int idx;
				5583	LIST_HEAD(invalid_list);
				5584
				5585	/*
				5586	* Never scan more than sc->nr_to_scan VM instances.
				5587	* Will not hit this condition practically since we do not try
				5588	* to shrink more than one VM and it is very unlikely to see
				5589	* !n_used_mmu_pages so many times.
				5590	*/
				5591	if (!nr_to_scan--)
				5592	break;
				5593	/*
				5594	* n_used_mmu_pages is accessed without holding kvm->mmu_lock
				5595	* here. We may skip a VM instance errorneosly, but we do not
				5596	* want to shrink a VM that only started to populate its MMU
				5597	* anyway.
				5598	*/
				5599	if (!kvm->arch.n_used_mmu_pages &&
				5600	!kvm_has_zapped_obsolete_pages(kvm))
				5601	continue;
				5602
				5603	idx = srcu_read_lock(&kvm->srcu);
				5604	spin_lock(&kvm->mmu_lock);
				5605
				5606	if (kvm_has_zapped_obsolete_pages(kvm)) {
				5607	kvm_mmu_commit_zap_page(kvm,
				5608	&kvm->arch.zapped_obsolete_pages);
				5609	goto unlock;
				5610	}
				5611
				5612	if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				5613	freed++;
				5614	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				5615
				5616	unlock:
				5617	spin_unlock(&kvm->mmu_lock);
				5618	srcu_read_unlock(&kvm->srcu, idx);
				5619
				5620	/*
				5621	* unfair on small ones
				5622	* per-vm shrinkers cry out
				5623	* sadness comes quickly
				5624	*/
				5625	list_move_tail(&kvm->vm_list, &vm_list);
				5626	break;
				5627	}
				5628
				5629	mutex_unlock(&kvm_lock);
				5630	return freed;
				5631	}
				5632
				5633	static unsigned long
				5634	mmu_shrink_count(struct shrinker shrink, struct shrink_control sc)
				5635	{
				5636	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
				5637	}
				5638
				5639	static struct shrinker mmu_shrinker = {
				5640	.count_objects = mmu_shrink_count,
				5641	.scan_objects = mmu_shrink_scan,
				5642	.seeks = DEFAULT_SEEKS * 10,
				5643	};
				5644
				5645	static void mmu_destroy_caches(void)
				5646	{
				5647	if (pte_list_desc_cache)
				5648	kmem_cache_destroy(pte_list_desc_cache);
				5649	if (mmu_page_header_cache)
				5650	kmem_cache_destroy(mmu_page_header_cache);
				5651	}
				5652
				5653	static bool get_nx_auto_mode(void)
				5654	{
				5655	/* Return true when CPU has the bug, and mitigations are ON */
				5656	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
				5657	}
				5658
				5659	static void __set_nx_huge_pages(bool val)
				5660	{
				5661	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
				5662	}
				5663
				5664	static int set_nx_huge_pages(const char val, const struct kernel_param kp)
				5665	{
				5666	bool old_val = nx_huge_pages;
				5667	bool new_val;
				5668
				5669	/* In "auto" mode deploy workaround only if CPU has the bug. */
				5670	if (sysfs_streq(val, "off"))
				5671	new_val = 0;
				5672	else if (sysfs_streq(val, "force"))
				5673	new_val = 1;
				5674	else if (sysfs_streq(val, "auto"))
				5675	new_val = get_nx_auto_mode();
				5676	else if (strtobool(val, &new_val) < 0)
				5677	return -EINVAL;
				5678
				5679	__set_nx_huge_pages(new_val);
				5680
				5681	if (new_val != old_val) {
				5682	struct kvm *kvm;
				5683	int idx;
				5684
				5685	mutex_lock(&kvm_lock);
				5686
				5687	list_for_each_entry(kvm, &vm_list, vm_list) {
				5688	idx = srcu_read_lock(&kvm->srcu);
				5689	kvm_mmu_invalidate_zap_all_pages(kvm);
				5690	srcu_read_unlock(&kvm->srcu, idx);
				5691
				5692	wake_up_process(kvm->arch.nx_lpage_recovery_thread);
				5693	}
				5694	mutex_unlock(&kvm_lock);
				5695	}
				5696
				5697	return 0;
				5698	}
				5699
				5700	static void kvm_set_mmio_spte_mask(void)
				5701	{
				5702	u64 mask;
				5703
				5704	/*
				5705	* Set a reserved PA bit in MMIO SPTEs to generate page faults with
				5706	* PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
				5707	* paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
				5708	* 52-bit physical addresses then there are no reserved PA bits in the
				5709	* PTEs and so the reserved PA approach must be disabled.
				5710	*/
				5711	if (shadow_phys_bits < 52)
				5712	mask = BIT_ULL(51) \| PT_PRESENT_MASK;
				5713	else
				5714	mask = 0;
				5715
				5716	kvm_mmu_set_mmio_spte_mask(mask, mask);
				5717	}
				5718
				5719	int kvm_mmu_module_init(void)
				5720	{
				5721	if (nx_huge_pages == -1)
				5722	__set_nx_huge_pages(get_nx_auto_mode());
				5723
				5724	kvm_mmu_reset_all_pte_masks();
				5725
				5726	kvm_set_mmio_spte_mask();
				5727
				5728	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
				5729	sizeof(struct pte_list_desc),
				5730	0, SLAB_ACCOUNT, NULL);
				5731	if (!pte_list_desc_cache)
				5732	goto nomem;
				5733
				5734	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
				5735	sizeof(struct kvm_mmu_page),
				5736	0, SLAB_ACCOUNT, NULL);
				5737	if (!mmu_page_header_cache)
				5738	goto nomem;
				5739
				5740	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
				5741	goto nomem;
				5742
				5743	register_shrinker(&mmu_shrinker);
				5744
				5745	return 0;
				5746
				5747	nomem:
				5748	mmu_destroy_caches();
				5749	return -ENOMEM;
				5750	}
				5751
				5752	/*
				5753	* Caculate mmu pages needed for kvm.
				5754	*/
				5755	unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
				5756	{
				5757	unsigned int nr_mmu_pages;
				5758	unsigned int nr_pages = 0;
				5759	struct kvm_memslots *slots;
				5760	struct kvm_memory_slot *memslot;
				5761	int i;
				5762
				5763	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				5764	slots = __kvm_memslots(kvm, i);
				5765
				5766	kvm_for_each_memslot(memslot, slots)
				5767	nr_pages += memslot->npages;
				5768	}
				5769
				5770	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
				5771	nr_mmu_pages = max(nr_mmu_pages,
				5772	(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
				5773
				5774	return nr_mmu_pages;
				5775	}
				5776
				/*
				 * Tear down the MMU state of @vcpu: unload the MMU first, then free the
				 * vCPU's MMU pages, and finally release its MMU memory caches.
				 */
				void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
				{
					/* The calls are kept in this exact order. */
					kvm_mmu_unload(vcpu);

					free_mmu_pages(vcpu);

					mmu_free_memory_caches(vcpu);
				}
				5783
				5784	void kvm_mmu_module_exit(void)
				5785	{
				5786	mmu_destroy_caches();
				5787	percpu_counter_destroy(&kvm_total_used_mmu_pages);
				5788	unregister_shrinker(&mmu_shrinker);
				5789	mmu_audit_disable();
				5790	}
				5791
				5792	static int set_nx_huge_pages_recovery_ratio(const char val, const struct kernel_param kp)
				5793	{
				5794	unsigned int old_val;
				5795	int err;
				5796
				5797	old_val = nx_huge_pages_recovery_ratio;
				5798	err = param_set_uint(val, kp);
				5799	if (err)
				5800	return err;
				5801
				5802	if (READ_ONCE(nx_huge_pages) &&
				5803	!old_val && nx_huge_pages_recovery_ratio) {
				5804	struct kvm *kvm;
				5805
				5806	mutex_lock(&kvm_lock);
				5807
				5808	list_for_each_entry(kvm, &vm_list, vm_list)
				5809	wake_up_process(kvm->arch.nx_lpage_recovery_thread);
				5810
				5811	mutex_unlock(&kvm_lock);
				5812	}
				5813
				5814	return err;
				5815	}
				5816
				5817	static void kvm_recover_nx_lpages(struct kvm *kvm)
				5818	{
				5819	int rcu_idx;
				5820	struct kvm_mmu_page *sp;
				5821	unsigned int ratio;
				5822	LIST_HEAD(invalid_list);
				5823	ulong to_zap;
				5824
				5825	rcu_idx = srcu_read_lock(&kvm->srcu);
				5826	spin_lock(&kvm->mmu_lock);
				5827
				5828	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
				5829	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
				5830	while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
				5831	/*
				5832	* We use a separate list instead of just using active_mmu_pages
				5833	* because the number of lpage_disallowed pages is expected to
				5834	* be relatively small compared to the total.
				5835	*/
				5836	sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
				5837	struct kvm_mmu_page,
				5838	lpage_disallowed_link);
				5839	WARN_ON_ONCE(!sp->lpage_disallowed);
				5840	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
				5841	WARN_ON_ONCE(sp->lpage_disallowed);
				5842
				5843	if (!--to_zap \|\| need_resched() \|\| spin_needbreak(&kvm->mmu_lock)) {
				5844	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				5845	if (to_zap)
				5846	cond_resched_lock(&kvm->mmu_lock);
				5847	}
				5848	}
				5849
				5850	spin_unlock(&kvm->mmu_lock);
				5851	srcu_read_unlock(&kvm->srcu, rcu_idx);
				5852	}
				5853
				5854	static long get_nx_lpage_recovery_timeout(u64 start_time)
				5855	{
				5856	return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
				5857	? start_time + 60 * HZ - get_jiffies_64()
				5858	: MAX_SCHEDULE_TIMEOUT;
				5859	}
				5860
				5861	static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
				5862	{
				5863	u64 start_time;
				5864	long remaining_time;
				5865
				5866	while (true) {
				5867	start_time = get_jiffies_64();
				5868	remaining_time = get_nx_lpage_recovery_timeout(start_time);
				5869
				5870	set_current_state(TASK_INTERRUPTIBLE);
				5871	while (!kthread_should_stop() && remaining_time > 0) {
				5872	schedule_timeout(remaining_time);
				5873	remaining_time = get_nx_lpage_recovery_timeout(start_time);
				5874	set_current_state(TASK_INTERRUPTIBLE);
				5875	}
				5876
				5877	set_current_state(TASK_RUNNING);
				5878
				5879	if (kthread_should_stop())
				5880	return 0;
				5881
				5882	kvm_recover_nx_lpages(kvm);
				5883	}
				5884	}
				5885
				5886	int kvm_mmu_post_init_vm(struct kvm *kvm)
				5887	{
				5888	int err;
				5889
				5890	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
				5891	"kvm-nx-lpage-recovery",
				5892	&kvm->arch.nx_lpage_recovery_thread);
				5893	if (!err)
				5894	kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
				5895
				5896	return err;
				5897	}
				5898
				5899	void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
				5900	{
				5901	if (kvm->arch.nx_lpage_recovery_thread)
				5902	kthread_stop(kvm->arch.nx_lpage_recovery_thread);
				5903	}