Blame - src/kernel/linux/v4.19/arch/x86/kvm/mmu.c - T800

blob: eddf91a0e363eab11933ed8e2b3bd01e956be982 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* This module enables machines with Intel VT-x extensions to run virtual
				5	* machines without emulation or binary translation.
				6	*
				7	* MMU support
				8	*
				9	* Copyright (C) 2006 Qumranet, Inc.
				10	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				11	*
				12	* Authors:
				13	* Yaniv Kamay <yaniv@qumranet.com>
				14	* Avi Kivity <avi@qumranet.com>
				15	*
				16	* This work is licensed under the terms of the GNU GPL, version 2. See
				17	* the COPYING file in the top-level directory.
				18	*
				19	*/
				20
				21	#include "irq.h"
				22	#include "mmu.h"
				23	#include "x86.h"
				24	#include "kvm_cache_regs.h"
				25	#include "cpuid.h"
				26
				27	#include <linux/kvm_host.h>
				28	#include <linux/types.h>
				29	#include <linux/string.h>
				30	#include <linux/mm.h>
				31	#include <linux/highmem.h>
				32	#include <linux/moduleparam.h>
				33	#include <linux/export.h>
				34	#include <linux/swap.h>
				35	#include <linux/hugetlb.h>
				36	#include <linux/compiler.h>
				37	#include <linux/srcu.h>
				38	#include <linux/slab.h>
				39	#include <linux/sched/signal.h>
				40	#include <linux/uaccess.h>
				41	#include <linux/hash.h>
				42	#include <linux/kern_levels.h>
				43	#include <linux/kthread.h>
				44
				45	#include <asm/page.h>
				46	#include <asm/pat.h>
				47	#include <asm/cmpxchg.h>
				48	#include <asm/io.h>
				49	#include <asm/vmx.h>
				50	#include <asm/kvm_page_track.h>
				51	#include "trace.h"
				52
				53	extern bool itlb_multihit_kvm_mitigation;
				54
				55	static int __read_mostly nx_huge_pages = -1;
				56	static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
				57
				58	static int set_nx_huge_pages(const char val, const struct kernel_param kp);
				59	static int set_nx_huge_pages_recovery_ratio(const char val, const struct kernel_param kp);
				60
				61	static struct kernel_param_ops nx_huge_pages_ops = {
				62	.set = set_nx_huge_pages,
				63	.get = param_get_bool,
				64	};
				65
				66	static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
				67	.set = set_nx_huge_pages_recovery_ratio,
				68	.get = param_get_uint,
				69	};
				70
				71	module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
				72	__MODULE_PARM_TYPE(nx_huge_pages, "bool");
				73	module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
				74	&nx_huge_pages_recovery_ratio, 0644);
				75	__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
				76
				77	/*
				78	* When setting this variable to true it enables Two-Dimensional-Paging
				79	* where the hardware walks 2 page tables:
				80	* 1. the guest-virtual to guest-physical
				81	* 2. while doing 1. it walks guest-physical to host-physical
				82	* If the hardware supports that we don't need to do shadow paging.
				83	*/
				84	bool tdp_enabled = false;
				85
				86	enum {
				87	AUDIT_PRE_PAGE_FAULT,
				88	AUDIT_POST_PAGE_FAULT,
				89	AUDIT_PRE_PTE_WRITE,
				90	AUDIT_POST_PTE_WRITE,
				91	AUDIT_PRE_SYNC,
				92	AUDIT_POST_SYNC
				93	};
				94
				/* Debug output is compiled out unless MMU_DEBUG is defined here. */
				#undef MMU_DEBUG

				#ifdef MMU_DEBUG
				/* Runtime switch for the debug printks, exposed as a module parameter. */
				static bool dbg = 0;
				module_param(dbg, bool, 0644);

				#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
				#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
				#define MMU_WARN_ON(x) WARN_ON(x)
				#else
				/* Non-debug build: all three helpers expand to empty statements. */
				#define pgprintk(x...) do { } while (0)
				#define rmap_printk(x...) do { } while (0)
				#define MMU_WARN_ON(x) do { } while (0)
				#endif
				109
				110	#define PTE_PREFETCH_NUM 8
				111
				112	#define PT_FIRST_AVAIL_BITS_SHIFT 10
				113	#define PT64_SECOND_AVAIL_BITS_SHIFT 52
				114
				115	#define PT64_LEVEL_BITS 9
				116
				117	#define PT64_LEVEL_SHIFT(level) \
				118	(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
				119
				120	#define PT64_INDEX(address, level)\
				121	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
				122
				123
				124	#define PT32_LEVEL_BITS 10
				125
				126	#define PT32_LEVEL_SHIFT(level) \
				127	(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
				128
				129	#define PT32_LVL_OFFSET_MASK(level) \
				130	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
				131	* PT32_LEVEL_BITS))) - 1))
				132
				133	#define PT32_INDEX(address, level)\
				134	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
				135
				136
				137	#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
				138	#define PT64_DIR_BASE_ADDR_MASK \
				139	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
				140	#define PT64_LVL_ADDR_MASK(level) \
				141	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
				142	* PT64_LEVEL_BITS))) - 1))
				143	#define PT64_LVL_OFFSET_MASK(level) \
				144	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
				145	* PT64_LEVEL_BITS))) - 1))
				146
				147	#define PT32_BASE_ADDR_MASK PAGE_MASK
				148	#define PT32_DIR_BASE_ADDR_MASK \
				149	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
				150	#define PT32_LVL_ADDR_MASK(level) \
				151	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
				152	* PT32_LEVEL_BITS))) - 1))
				153
				154	#define PT64_PERM_MASK (PT_PRESENT_MASK \| PT_WRITABLE_MASK \| shadow_user_mask \
				155	\| shadow_x_mask \| shadow_nx_mask \| shadow_me_mask)
				156
				157	#define ACC_EXEC_MASK 1
				158	#define ACC_WRITE_MASK PT_WRITABLE_MASK
				159	#define ACC_USER_MASK PT_USER_MASK
				160	#define ACC_ALL (ACC_EXEC_MASK \| ACC_WRITE_MASK \| ACC_USER_MASK)
				161
				162	/* The mask for the R/X bits in EPT PTEs */
				163	#define PT64_EPT_READABLE_MASK 0x1ull
				164	#define PT64_EPT_EXECUTABLE_MASK 0x4ull
				165
				166	#include <trace/events/kvm.h>
				167
				168	#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
				169	#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
				170
				171	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
				172
				/* Chosen so that struct pte_list_desc fits well in a cache line. */
				#define PTE_LIST_EXT 3

				/*
				 * Return values of handle_mmio_page_fault and mmu.page_fault:
				 *
				 * RET_PF_RETRY:   let the CPU fault again on the address.
				 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
				 *
				 * For handle_mmio_page_fault only:
				 *
				 * RET_PF_INVALID: the spte is invalid, let the real page fault path
				 *                 update it.
				 */
				enum {
					RET_PF_RETRY = 0,
					RET_PF_EMULATE = 1,
					RET_PF_INVALID = 2,
				};
				189
				190	struct pte_list_desc {
				191	u64 *sptes[PTE_LIST_EXT];
				192	struct pte_list_desc *more;
				193	};
				194
				195	struct kvm_shadow_walk_iterator {
				196	u64 addr;
				197	hpa_t shadow_addr;
				198	u64 *sptep;
				199	int level;
				200	unsigned index;
				201	};
				202
				203	static const union kvm_mmu_page_role mmu_base_role_mask = {
				204	.cr0_wp = 1,
				205	.cr4_pae = 1,
				206	.nxe = 1,
				207	.smep_andnot_wp = 1,
				208	.smap_andnot_wp = 1,
				209	.smm = 1,
				210	.guest_mode = 1,
				211	.ad_disabled = 1,
				212	};
				213
				/* Walk the shadow page table for @_addr starting from an explicit @_root. */
				#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
					for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
									 (_root), (_addr));		\
					     shadow_walk_okay(&(_walker));				\
					     shadow_walk_next(&(_walker)))

				/* Walk the shadow page table for @_addr using shadow_walk_init(). */
				#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
					for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
					     shadow_walk_okay(&(_walker));			\
					     shadow_walk_next(&(_walker)))

				/*
				 * Lockless variant: @spte is loaded with mmu_spte_get_lockless() on each
				 * iteration and that snapshot is what drives the step to the next level.
				 * The walker argument is parenthesized before member access so that any
				 * lvalue expression can be passed.
				 */
				#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
					for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
					     shadow_walk_okay(&(_walker)) &&				\
						({ spte = mmu_spte_get_lockless((_walker).sptep); 1; });	\
					     __shadow_walk_next(&(_walker), spte))
				230
				231	static struct kmem_cache *pte_list_desc_cache;
				232	static struct kmem_cache *mmu_page_header_cache;
				233	static struct percpu_counter kvm_total_used_mmu_pages;
				234
				235	static u64 __read_mostly shadow_nx_mask;
				236	static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
				237	static u64 __read_mostly shadow_user_mask;
				238	static u64 __read_mostly shadow_accessed_mask;
				239	static u64 __read_mostly shadow_dirty_mask;
				240	static u64 __read_mostly shadow_mmio_mask;
				241	static u64 __read_mostly shadow_mmio_value;
				242	static u64 __read_mostly shadow_present_mask;
				243	static u64 __read_mostly shadow_me_mask;
				244
				245	/*
				246	* SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
				247	* Non-present SPTEs with shadow_acc_track_value set are in place for access
				248	* tracking.
				249	*/
				250	static u64 __read_mostly shadow_acc_track_mask;
				251	static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
				252
				253	/*
				254	* The mask/shift to use for saving the original R/X bits when marking the PTE
				255	* as not-present for access tracking purposes. We do not save the W bit as the
				256	* PTEs being access tracked also need to be dirty tracked, so the W bit will be
				257	* restored only when a write is attempted to the page.
				258	*/
				259	static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK \|
				260	PT64_EPT_EXECUTABLE_MASK;
				261	static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
				262
				263	/*
				264	* This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
				265	* to guard against L1TF attacks.
				266	*/
				267	static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
				268
				269	/*
				270	* The number of high-order 1 bits to use in the mask above.
				271	*/
				272	static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
				273
				274	/*
				275	* In some cases, we need to preserve the GFN of a non-present or reserved
				276	* SPTE when we usurp the upper five bits of the physical address space to
				277	* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
				278	* shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
				279	* left into the reserved bits, i.e. the GFN in the SPTE will be split into
				280	* high and low parts. This mask covers the lower bits of the GFN.
				281	*/
				282	static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
				283
				284
				285	static void mmu_spte_set(u64 *sptep, u64 spte);
				286	static bool is_executable_pte(u64 spte);
				287	static union kvm_mmu_page_role
				288	kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
				289
				290	#define CREATE_TRACE_POINTS
				291	#include "mmutrace.h"
				292
				293
				294	void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
				295	{
				296	BUG_ON((mmio_mask & mmio_value) != mmio_value);
				297	shadow_mmio_value = mmio_value \| SPTE_SPECIAL_MASK;
				298	shadow_mmio_mask = mmio_mask \| SPTE_SPECIAL_MASK;
				299	}
				300	EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
				301
				302	static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
				303	{
				304	return sp->role.ad_disabled;
				305	}
				306
				307	static inline bool spte_ad_enabled(u64 spte)
				308	{
				309	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				310	return !(spte & shadow_acc_track_value);
				311	}
				312
				313	static bool is_nx_huge_page_enabled(void)
				314	{
				315	return READ_ONCE(nx_huge_pages);
				316	}
				317
				318	static inline u64 spte_shadow_accessed_mask(u64 spte)
				319	{
				320	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				321	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
				322	}
				323
				324	static inline u64 spte_shadow_dirty_mask(u64 spte)
				325	{
				326	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				327	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
				328	}
				329
				330	static inline bool is_access_track_spte(u64 spte)
				331	{
				332	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
				333	}
				334
				/*
				 * The low bit of the generation number is always presumed to be zero.
				 * This disables mmio caching during memslot updates.  The concept is
				 * similar to a seqcount, but instead of retrying the access we just punt
				 * and ignore the cache.
				 *
				 * Layout inside an MMIO spte:
				 *   spte bits  3-11 hold bits  1-9  of the generation number,
				 *   spte bits 52-61 hold bits 10-19 of the generation number.
				 */
				#define MMIO_SPTE_GEN_LOW_SHIFT		2
				#define MMIO_SPTE_GEN_HIGH_SHIFT	52

				#define MMIO_GEN_SHIFT			20
				#define MMIO_GEN_LOW_SHIFT		10
				#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
				#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
				351
				352	static u64 generation_mmio_spte_mask(unsigned int gen)
				353	{
				354	u64 mask;
				355
				356	WARN_ON(gen & ~MMIO_GEN_MASK);
				357
				358	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
				359	mask \|= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
				360	return mask;
				361	}
				362
				363	static unsigned int get_mmio_spte_generation(u64 spte)
				364	{
				365	unsigned int gen;
				366
				367	spte &= ~shadow_mmio_mask;
				368
				369	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
				370	gen \|= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
				371	return gen;
				372	}
				373
				374	static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
				375	{
				376	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
				377	}
				378
				379	static void mark_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, u64 gfn,
				380	unsigned access)
				381	{
				382	unsigned int gen = kvm_current_mmio_generation(vcpu);
				383	u64 mask = generation_mmio_spte_mask(gen);
				384	u64 gpa = gfn << PAGE_SHIFT;
				385
				386	access &= ACC_WRITE_MASK \| ACC_USER_MASK;
				387	mask \|= shadow_mmio_value \| access;
				388	mask \|= gpa \| shadow_nonpresent_or_rsvd_mask;
				389	mask \|= (gpa & shadow_nonpresent_or_rsvd_mask)
				390	<< shadow_nonpresent_or_rsvd_mask_len;
				391
				392	trace_mark_mmio_spte(sptep, gfn, access, gen);
				393	mmu_spte_set(sptep, mask);
				394	}
				395
				396	static bool is_mmio_spte(u64 spte)
				397	{
				398	return (spte & shadow_mmio_mask) == shadow_mmio_value;
				399	}
				400
				401	static gfn_t get_mmio_spte_gfn(u64 spte)
				402	{
				403	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
				404
				405	gpa \|= (spte >> shadow_nonpresent_or_rsvd_mask_len)
				406	& shadow_nonpresent_or_rsvd_mask;
				407
				408	return gpa >> PAGE_SHIFT;
				409	}
				410
				411	static unsigned get_mmio_spte_access(u64 spte)
				412	{
				413	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) \| shadow_mmio_mask;
				414	return (spte & ~mask) & ~PAGE_MASK;
				415	}
				416
				417	static bool set_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				418	kvm_pfn_t pfn, unsigned access)
				419	{
				420	if (unlikely(is_noslot_pfn(pfn))) {
				421	mark_mmio_spte(vcpu, sptep, gfn, access);
				422	return true;
				423	}
				424
				425	return false;
				426	}
				427
				428	static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
				429	{
				430	unsigned int kvm_gen, spte_gen;
				431
				432	kvm_gen = kvm_current_mmio_generation(vcpu);
				433	spte_gen = get_mmio_spte_generation(spte);
				434
				435	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
				436	return likely(kvm_gen == spte_gen);
				437	}
				438
				439	/*
				440	* Sets the shadow PTE masks used by the MMU.
				441	*
				442	* Assumptions:
				443	* - Setting either @accessed_mask or @dirty_mask requires setting both
				444	* - At least one of @accessed_mask or @acc_track_mask must be set
				445	*/
				446	void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
				447	u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
				448	u64 acc_track_mask, u64 me_mask)
				449	{
				450	BUG_ON(!dirty_mask != !accessed_mask);
				451	BUG_ON(!accessed_mask && !acc_track_mask);
				452	BUG_ON(acc_track_mask & shadow_acc_track_value);
				453
				454	shadow_user_mask = user_mask;
				455	shadow_accessed_mask = accessed_mask;
				456	shadow_dirty_mask = dirty_mask;
				457	shadow_nx_mask = nx_mask;
				458	shadow_x_mask = x_mask;
				459	shadow_present_mask = p_mask;
				460	shadow_acc_track_mask = acc_track_mask;
				461	shadow_me_mask = me_mask;
				462	}
				463	EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
				464
				465	static void kvm_mmu_reset_all_pte_masks(void)
				466	{
				467	u8 low_phys_bits;
				468
				469	shadow_user_mask = 0;
				470	shadow_accessed_mask = 0;
				471	shadow_dirty_mask = 0;
				472	shadow_nx_mask = 0;
				473	shadow_x_mask = 0;
				474	shadow_mmio_mask = 0;
				475	shadow_present_mask = 0;
				476	shadow_acc_track_mask = 0;
				477
				478	/*
				479	* If the CPU has 46 or less physical address bits, then set an
				480	* appropriate mask to guard against L1TF attacks. Otherwise, it is
				481	* assumed that the CPU is not vulnerable to L1TF.
				482	*/
				483	low_phys_bits = boot_cpu_data.x86_phys_bits;
				484	if (boot_cpu_data.x86_phys_bits <
				485	52 - shadow_nonpresent_or_rsvd_mask_len) {
				486	shadow_nonpresent_or_rsvd_mask =
				487	rsvd_bits(boot_cpu_data.x86_phys_bits -
				488	shadow_nonpresent_or_rsvd_mask_len,
				489	boot_cpu_data.x86_phys_bits - 1);
				490	low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
				491	}
				492	shadow_nonpresent_or_rsvd_lower_gfn_mask =
				493	GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
				494	}
				495
				/* PSE36 is unconditionally reported as supported. */
				static int is_cpuid_PSE36(void)
				{
					return 1;
				}
				500
				501	static int is_nx(struct kvm_vcpu *vcpu)
				502	{
				503	return vcpu->arch.efer & EFER_NX;
				504	}
				505
				506	static int is_shadow_present_pte(u64 pte)
				507	{
				508	return (pte != 0) && !is_mmio_spte(pte);
				509	}
				510
				511	static int is_large_pte(u64 pte)
				512	{
				513	return pte & PT_PAGE_SIZE_MASK;
				514	}
				515
				516	static int is_last_spte(u64 pte, int level)
				517	{
				518	if (level == PT_PAGE_TABLE_LEVEL)
				519	return 1;
				520	if (is_large_pte(pte))
				521	return 1;
				522	return 0;
				523	}
				524
				525	static bool is_executable_pte(u64 spte)
				526	{
				527	return (spte & (shadow_x_mask \| shadow_nx_mask)) == shadow_x_mask;
				528	}
				529
				530	static kvm_pfn_t spte_to_pfn(u64 pte)
				531	{
				532	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
				533	}
				534
				535	static gfn_t pse36_gfn_delta(u32 gpte)
				536	{
				537	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
				538
				539	return (gpte & PT32_DIR_PSE36_MASK) << shift;
				540	}
				541
				542	#ifdef CONFIG_X86_64
				543	static void __set_spte(u64 *sptep, u64 spte)
				544	{
				545	WRITE_ONCE(*sptep, spte);
				546	}
				547
				548	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
				549	{
				550	WRITE_ONCE(*sptep, spte);
				551	}
				552
				553	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
				554	{
				555	return xchg(sptep, spte);
				556	}
				557
				558	static u64 __get_spte_lockless(u64 *sptep)
				559	{
				560	return READ_ONCE(*sptep);
				561	}
				562	#else
				563	union split_spte {
				564	struct {
				565	u32 spte_low;
				566	u32 spte_high;
				567	};
				568	u64 spte;
				569	};
				570
				571	static void count_spte_clear(u64 *sptep, u64 spte)
				572	{
				573	struct kvm_mmu_page *sp = page_header(__pa(sptep));
				574
				575	if (is_shadow_present_pte(spte))
				576	return;
				577
				578	/* Ensure the spte is completely set before we increase the count */
				579	smp_wmb();
				580	sp->clear_spte_count++;
				581	}
				582
				583	static void __set_spte(u64 *sptep, u64 spte)
				584	{
				585	union split_spte *ssptep, sspte;
				586
				587	ssptep = (union split_spte *)sptep;
				588	sspte = (union split_spte)spte;
				589
				590	ssptep->spte_high = sspte.spte_high;
				591
				592	/*
				593	* If we map the spte from nonpresent to present, We should store
				594	* the high bits firstly, then set present bit, so cpu can not
				595	* fetch this spte while we are setting the spte.
				596	*/
				597	smp_wmb();
				598
				599	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
				600	}
				601
				602	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
				603	{
				604	union split_spte *ssptep, sspte;
				605
				606	ssptep = (union split_spte *)sptep;
				607	sspte = (union split_spte)spte;
				608
				609	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
				610
				611	/*
				612	* If we map the spte from present to nonpresent, we should clear
				613	* present bit firstly to avoid vcpu fetch the old high bits.
				614	*/
				615	smp_wmb();
				616
				617	ssptep->spte_high = sspte.spte_high;
				618	count_spte_clear(sptep, spte);
				619	}
				620
				621	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
				622	{
				623	union split_spte *ssptep, sspte, orig;
				624
				625	ssptep = (union split_spte *)sptep;
				626	sspte = (union split_spte)spte;
				627
				628	/* xchg acts as a barrier before the setting of the high bits */
				629	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
				630	orig.spte_high = ssptep->spte_high;
				631	ssptep->spte_high = sspte.spte_high;
				632	count_spte_clear(sptep, spte);
				633
				634	return orig.spte;
				635	}
				636
				637	/*
				638	* The idea using the light way get the spte on x86_32 guest is from
				639	* gup_get_pte(arch/x86/mm/gup.c).
				640	*
				641	* An spte tlb flush may be pending, because kvm_set_pte_rmapp
				642	* coalesces them and we are running out of the MMU lock. Therefore
				643	* we need to protect against in-progress updates of the spte.
				644	*
				645	* Reading the spte while an update is in progress may get the old value
				646	* for the high part of the spte. The race is fine for a present->non-present
				647	* change (because the high part of the spte is ignored for non-present spte),
				648	* but for a present->present change we must reread the spte.
				649	*
				650	* All such changes are done in two steps (present->non-present and
				651	* non-present->present), hence it is enough to count the number of
				652	* present->non-present updates: if it changed while reading the spte,
				653	* we might have hit the race. This is done using clear_spte_count.
				654	*/
				655	static u64 __get_spte_lockless(u64 *sptep)
				656	{
				657	struct kvm_mmu_page *sp = page_header(__pa(sptep));
				658	union split_spte spte, orig = (union split_spte )sptep;
				659	int count;
				660
				661	retry:
				662	count = sp->clear_spte_count;
				663	smp_rmb();
				664
				665	spte.spte_low = orig->spte_low;
				666	smp_rmb();
				667
				668	spte.spte_high = orig->spte_high;
				669	smp_rmb();
				670
				671	if (unlikely(spte.spte_low != orig->spte_low \|\|
				672	count != sp->clear_spte_count))
				673	goto retry;
				674
				675	return spte.spte;
				676	}
				677	#endif
				678
				679	static bool spte_can_locklessly_be_made_writable(u64 spte)
				680	{
				681	return (spte & (SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE)) ==
				682	(SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE);
				683	}
				684
				685	static bool spte_has_volatile_bits(u64 spte)
				686	{
				687	if (!is_shadow_present_pte(spte))
				688	return false;
				689
				690	/*
				691	* Always atomically update spte if it can be updated
				692	* out of mmu-lock, it can ensure dirty bit is not lost,
				693	* also, it can help us to get a stable is_writable_pte()
				694	* to ensure tlb flush is not missed.
				695	*/
				696	if (spte_can_locklessly_be_made_writable(spte) \|\|
				697	is_access_track_spte(spte))
				698	return true;
				699
				700	if (spte_ad_enabled(spte)) {
				701	if ((spte & shadow_accessed_mask) == 0 \|\|
				702	(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
				703	return true;
				704	}
				705
				706	return false;
				707	}
				708
				709	static bool is_accessed_spte(u64 spte)
				710	{
				711	u64 accessed_mask = spte_shadow_accessed_mask(spte);
				712
				713	return accessed_mask ? spte & accessed_mask
				714	: !is_access_track_spte(spte);
				715	}
				716
				717	static bool is_dirty_spte(u64 spte)
				718	{
				719	u64 dirty_mask = spte_shadow_dirty_mask(spte);
				720
				721	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
				722	}
				723
				724	/* Rules for using mmu_spte_set:
				725	* Set the sptep from nonpresent to present.
				726	* Note: the sptep being assigned must be either not present
				727	* or in a state where the hardware will not attempt to update
				728	* the spte.
				729	*/
				730	static void mmu_spte_set(u64 *sptep, u64 new_spte)
				731	{
				732	WARN_ON(is_shadow_present_pte(*sptep));
				733	__set_spte(sptep, new_spte);
				734	}
				735
				736	/*
				737	* Update the SPTE (excluding the PFN), but do not track changes in its
				738	* accessed/dirty status.
				739	*/
				740	static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
				741	{
				742	u64 old_spte = *sptep;
				743
				744	WARN_ON(!is_shadow_present_pte(new_spte));
				745
				746	if (!is_shadow_present_pte(old_spte)) {
				747	mmu_spte_set(sptep, new_spte);
				748	return old_spte;
				749	}
				750
				751	if (!spte_has_volatile_bits(old_spte))
				752	__update_clear_spte_fast(sptep, new_spte);
				753	else
				754	old_spte = __update_clear_spte_slow(sptep, new_spte);
				755
				756	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
				757
				758	return old_spte;
				759	}
				760
				761	/* Rules for using mmu_spte_update:
				762	* Update the state bits, it means the mapped pfn is not changed.
				763	*
				764	* Whenever we overwrite a writable spte with a read-only one we
				765	* should flush remote TLBs. Otherwise rmap_write_protect
				766	* will find a read-only spte, even though the writable spte
				767	* might be cached on a CPU's TLB, the return value indicates this
				768	* case.
				769	*
				770	* Returns true if the TLB needs to be flushed
				771	*/
				772	static bool mmu_spte_update(u64 *sptep, u64 new_spte)
				773	{
				774	bool flush = false;
				775	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
				776
				777	if (!is_shadow_present_pte(old_spte))
				778	return false;
				779
				780	/*
				781	* For the spte updated out of mmu-lock is safe, since
				782	* we always atomically update it, see the comments in
				783	* spte_has_volatile_bits().
				784	*/
				785	if (spte_can_locklessly_be_made_writable(old_spte) &&
				786	!is_writable_pte(new_spte))
				787	flush = true;
				788
				789	/*
				790	* Flush TLB when accessed/dirty states are changed in the page tables,
				791	* to guarantee consistency between TLB and page tables.
				792	*/
				793
				794	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
				795	flush = true;
				796	kvm_set_pfn_accessed(spte_to_pfn(old_spte));
				797	}
				798
				799	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
				800	flush = true;
				801	kvm_set_pfn_dirty(spte_to_pfn(old_spte));
				802	}
				803
				804	return flush;
				805	}
				806
				807	/*
				808	* Rules for using mmu_spte_clear_track_bits:
				809	* It sets the sptep from present to nonpresent, and track the
				810	* state bits, it is used to clear the last level sptep.
				811	* Returns non-zero if the PTE was previously valid.
				812	*/
				813	static int mmu_spte_clear_track_bits(u64 *sptep)
				814	{
				815	kvm_pfn_t pfn;
				816	u64 old_spte = *sptep;
				817
				818	if (!spte_has_volatile_bits(old_spte))
				819	__update_clear_spte_fast(sptep, 0ull);
				820	else
				821	old_spte = __update_clear_spte_slow(sptep, 0ull);
				822
				823	if (!is_shadow_present_pte(old_spte))
				824	return 0;
				825
				826	pfn = spte_to_pfn(old_spte);
				827
				828	/*
				829	* KVM does not hold the refcount of the page used by
				830	* kvm mmu, before reclaiming the page, we should
				831	* unmap it from mmu first.
				832	*/
				833	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
				834
				835	if (is_accessed_spte(old_spte))
				836	kvm_set_pfn_accessed(pfn);
				837
				838	if (is_dirty_spte(old_spte))
				839	kvm_set_pfn_dirty(pfn);
				840
				841	return 1;
				842	}
				843
				844	/*
				845	* Rules for using mmu_spte_clear_no_track:
				846	* Directly clear spte without caring the state bits of sptep,
				847	* it is used to set the upper level spte.
				848	*/
				849	static void mmu_spte_clear_no_track(u64 *sptep)
				850	{
				851	__update_clear_spte_fast(sptep, 0ull);
				852	}
				853
				854	static u64 mmu_spte_get_lockless(u64 *sptep)
				855	{
				856	return __get_spte_lockless(sptep);
				857	}
				858
				859	static u64 mark_spte_for_access_track(u64 spte)
				860	{
				861	if (spte_ad_enabled(spte))
				862	return spte & ~shadow_accessed_mask;
				863
				864	if (is_access_track_spte(spte))
				865	return spte;
				866
				867	/*
				868	* Making an Access Tracking PTE will result in removal of write access
				869	* from the PTE. So, verify that we will be able to restore the write
				870	* access in the fast page fault path later on.
				871	*/
				872	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
				873	!spte_can_locklessly_be_made_writable(spte),
				874	"kvm: Writable SPTE is not locklessly dirty-trackable\n");
				875
				876	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
				877	shadow_acc_track_saved_bits_shift),
				878	"kvm: Access Tracking saved bit locations are not zero\n");
				879
				880	spte \|= (spte & shadow_acc_track_saved_bits_mask) <<
				881	shadow_acc_track_saved_bits_shift;
				882	spte &= ~shadow_acc_track_mask;
				883
				884	return spte;
				885	}
				886
				887	/* Restore an acc-track PTE back to a regular PTE */
				888	static u64 restore_acc_track_spte(u64 spte)
				889	{
				890	u64 new_spte = spte;
				891	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
				892	& shadow_acc_track_saved_bits_mask;
				893
				894	WARN_ON_ONCE(spte_ad_enabled(spte));
				895	WARN_ON_ONCE(!is_access_track_spte(spte));
				896
				897	new_spte &= ~shadow_acc_track_mask;
				898	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
				899	shadow_acc_track_saved_bits_shift);
				900	new_spte \|= saved_bits;
				901
				902	return new_spte;
				903	}
				904
				905	/* Returns the Accessed status of the PTE and resets it at the same time. */
				906	static bool mmu_spte_age(u64 *sptep)
				907	{
				908	u64 spte = mmu_spte_get_lockless(sptep);
				909
				910	if (!is_accessed_spte(spte))
				911	return false;
				912
				913	if (spte_ad_enabled(spte)) {
				914	clear_bit((ffs(shadow_accessed_mask) - 1),
				915	(unsigned long *)sptep);
				916	} else {
				917	/*
				918	* Capture the dirty status of the page, so that it doesn't get
				919	* lost when the SPTE is marked for access tracking.
				920	*/
				921	if (is_writable_pte(spte))
				922	kvm_set_pfn_dirty(spte_to_pfn(spte));
				923
				924	spte = mark_spte_for_access_track(spte);
				925	mmu_spte_update_no_track(sptep, spte);
				926	}
				927
				928	return true;
				929	}
				930
				931	static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
				932	{
				933	/*
				934	* Prevent page table teardown by making any free-er wait during
				935	* kvm_flush_remote_tlbs() IPI to all active vcpus.
				936	*/
				937	local_irq_disable();
				938
				939	/*
				940	* Make sure a following spte read is not reordered ahead of the write
				941	* to vcpu->mode.
				942	*/
				943	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
				944	}
				945
				946	static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
				947	{
				948	/*
				949	* Make sure the write to vcpu->mode is not reordered in front of
				950	* reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
				951	* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
				952	*/
				953	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
				954	local_irq_enable();
				955	}
				956
				957	static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				958	struct kmem_cache *base_cache, int min)
				959	{
				960	void *obj;
				961
				962	if (cache->nobjs >= min)
				963	return 0;
				964	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				965	obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
				966	if (!obj)
				967	return -ENOMEM;
				968	cache->objects[cache->nobjs++] = obj;
				969	}
				970	return 0;
				971	}
				972
				973	static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
				974	{
				975	return cache->nobjs;
				976	}
				977
				978	static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				979	struct kmem_cache *cache)
				980	{
				981	while (mc->nobjs)
				982	kmem_cache_free(cache, mc->objects[--mc->nobjs]);
				983	}
				984
				985	static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				986	int min)
				987	{
				988	void *page;
				989
				990	if (cache->nobjs >= min)
				991	return 0;
				992	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				993	page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
				994	if (!page)
				995	return -ENOMEM;
				996	cache->objects[cache->nobjs++] = page;
				997	}
				998	return 0;
				999	}
				1000
				1001	static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
				1002	{
				1003	while (mc->nobjs)
				1004	free_page((unsigned long)mc->objects[--mc->nobjs]);
				1005	}
				1006
				1007	static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
				1008	{
				1009	int r;
				1010
				1011	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				1012	pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
				1013	if (r)
				1014	goto out;
				1015	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
				1016	if (r)
				1017	goto out;
				1018	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				1019	mmu_page_header_cache, 4);
				1020	out:
				1021	return r;
				1022	}
				1023
				1024	static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
				1025	{
				1026	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				1027	pte_list_desc_cache);
				1028	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
				1029	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				1030	mmu_page_header_cache);
				1031	}
				1032
				1033	static void mmu_memory_cache_alloc(struct kvm_mmu_memory_cache mc)
				1034	{
				1035	void *p;
				1036
				1037	BUG_ON(!mc->nobjs);
				1038	p = mc->objects[--mc->nobjs];
				1039	return p;
				1040	}
				1041
				1042	static struct pte_list_desc mmu_alloc_pte_list_desc(struct kvm_vcpu vcpu)
				1043	{
				1044	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
				1045	}
				1046
				1047	static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
				1048	{
				1049	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
				1050	}
				1051
				1052	static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
				1053	{
				1054	if (!sp->role.direct)
				1055	return sp->gfns[index];
				1056
				1057	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
				1058	}
				1059
				1060	static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
				1061	{
				1062	if (!sp->role.direct) {
				1063	sp->gfns[index] = gfn;
				1064	return;
				1065	}
				1066
				1067	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
				1068	pr_err_ratelimited("gfn mismatch under direct page %llx "
				1069	"(expected %llx, got %llx)\n",
				1070	sp->gfn,
				1071	kvm_mmu_page_get_gfn(sp, index), gfn);
				1072	}
				1073
				1074	/*
				1075	* Return the pointer to the large page information for a given gfn,
				1076	* handling slots that are not large page aligned.
				1077	*/
				1078	static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
				1079	struct kvm_memory_slot *slot,
				1080	int level)
				1081	{
				1082	unsigned long idx;
				1083
				1084	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1085	return &slot->arch.lpage_info[level - 2][idx];
				1086	}
				1087
				1088	static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
				1089	gfn_t gfn, int count)
				1090	{
				1091	struct kvm_lpage_info *linfo;
				1092	int i;
				1093
				1094	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1095	linfo = lpage_info_slot(gfn, slot, i);
				1096	linfo->disallow_lpage += count;
				1097	WARN_ON(linfo->disallow_lpage < 0);
				1098	}
				1099	}
				1100
				1101	void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1102	{
				1103	update_gfn_disallow_lpage_count(slot, gfn, 1);
				1104	}
				1105
				1106	void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1107	{
				1108	update_gfn_disallow_lpage_count(slot, gfn, -1);
				1109	}
				1110
				1111	static void account_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1112	{
				1113	struct kvm_memslots *slots;
				1114	struct kvm_memory_slot *slot;
				1115	gfn_t gfn;
				1116
				1117	kvm->arch.indirect_shadow_pages++;
				1118	gfn = sp->gfn;
				1119	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1120	slot = __gfn_to_memslot(slots, gfn);
				1121
				1122	/* the non-leaf shadow pages are keeping readonly. */
				1123	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1124	return kvm_slot_page_track_add_page(kvm, slot, gfn,
				1125	KVM_PAGE_TRACK_WRITE);
				1126
				1127	kvm_mmu_gfn_disallow_lpage(slot, gfn);
				1128	}
				1129
				1130	static void account_huge_nx_page(struct kvm kvm, struct kvm_mmu_page sp)
				1131	{
				1132	if (sp->lpage_disallowed)
				1133	return;
				1134
				1135	++kvm->stat.nx_lpage_splits;
				1136	list_add_tail(&sp->lpage_disallowed_link,
				1137	&kvm->arch.lpage_disallowed_mmu_pages);
				1138	sp->lpage_disallowed = true;
				1139	}
				1140
				1141	static void unaccount_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1142	{
				1143	struct kvm_memslots *slots;
				1144	struct kvm_memory_slot *slot;
				1145	gfn_t gfn;
				1146
				1147	kvm->arch.indirect_shadow_pages--;
				1148	gfn = sp->gfn;
				1149	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1150	slot = __gfn_to_memslot(slots, gfn);
				1151	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1152	return kvm_slot_page_track_remove_page(kvm, slot, gfn,
				1153	KVM_PAGE_TRACK_WRITE);
				1154
				1155	kvm_mmu_gfn_allow_lpage(slot, gfn);
				1156	}
				1157
				1158	static void unaccount_huge_nx_page(struct kvm kvm, struct kvm_mmu_page sp)
				1159	{
				1160	--kvm->stat.nx_lpage_splits;
				1161	sp->lpage_disallowed = false;
				1162	list_del(&sp->lpage_disallowed_link);
				1163	}
				1164
				1165	static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
				1166	struct kvm_memory_slot *slot)
				1167	{
				1168	struct kvm_lpage_info *linfo;
				1169
				1170	if (slot) {
				1171	linfo = lpage_info_slot(gfn, slot, level);
				1172	return !!linfo->disallow_lpage;
				1173	}
				1174
				1175	return true;
				1176	}
				1177
				1178	static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
				1179	int level)
				1180	{
				1181	struct kvm_memory_slot *slot;
				1182
				1183	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1184	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
				1185	}
				1186
				1187	static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
				1188	{
				1189	unsigned long page_size;
				1190	int i, ret = 0;
				1191
				1192	page_size = kvm_host_page_size(kvm, gfn);
				1193
				1194	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1195	if (page_size >= KVM_HPAGE_SIZE(i))
				1196	ret = i;
				1197	else
				1198	break;
				1199	}
				1200
				1201	return ret;
				1202	}
				1203
				1204	static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
				1205	bool no_dirty_log)
				1206	{
				1207	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1208	return false;
				1209	if (no_dirty_log && slot->dirty_bitmap)
				1210	return false;
				1211
				1212	return true;
				1213	}
				1214
				1215	static struct kvm_memory_slot *
				1216	gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
				1217	bool no_dirty_log)
				1218	{
				1219	struct kvm_memory_slot *slot;
				1220
				1221	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1222	if (!memslot_valid_for_gpte(slot, no_dirty_log))
				1223	slot = NULL;
				1224
				1225	return slot;
				1226	}
				1227
				1228	static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
				1229	bool *force_pt_level)
				1230	{
				1231	int host_level, level, max_level;
				1232	struct kvm_memory_slot *slot;
				1233
				1234	if (unlikely(*force_pt_level))
				1235	return PT_PAGE_TABLE_LEVEL;
				1236
				1237	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
				1238	*force_pt_level = !memslot_valid_for_gpte(slot, true);
				1239	if (unlikely(*force_pt_level))
				1240	return PT_PAGE_TABLE_LEVEL;
				1241
				1242	host_level = host_mapping_level(vcpu->kvm, large_gfn);
				1243
				1244	if (host_level == PT_PAGE_TABLE_LEVEL)
				1245	return host_level;
				1246
				1247	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
				1248
				1249	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
				1250	if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
				1251	break;
				1252
				1253	return level - 1;
				1254	}
				1255
				1256	/*
				1257	* About rmap_head encoding:
				1258	*
				1259	* If the bit zero of rmap_head->val is clear, then it points to the only spte
				1260	* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
				1261	* pte_list_desc containing more mappings.
				1262	*/
				1263
				1264	/*
				1265	* Returns the number of pointers in the rmap chain, not counting the new one.
				1266	*/
				1267	static int pte_list_add(struct kvm_vcpu vcpu, u64 spte,
				1268	struct kvm_rmap_head *rmap_head)
				1269	{
				1270	struct pte_list_desc *desc;
				1271	int i, count = 0;
				1272
				1273	if (!rmap_head->val) {
				1274	rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
				1275	rmap_head->val = (unsigned long)spte;
				1276	} else if (!(rmap_head->val & 1)) {
				1277	rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
				1278	desc = mmu_alloc_pte_list_desc(vcpu);
				1279	desc->sptes[0] = (u64 *)rmap_head->val;
				1280	desc->sptes[1] = spte;
				1281	rmap_head->val = (unsigned long)desc \| 1;
				1282	++count;
				1283	} else {
				1284	rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
				1285	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1286	while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
				1287	desc = desc->more;
				1288	count += PTE_LIST_EXT;
				1289	}
				1290	if (desc->sptes[PTE_LIST_EXT-1]) {
				1291	desc->more = mmu_alloc_pte_list_desc(vcpu);
				1292	desc = desc->more;
				1293	}
				1294	for (i = 0; desc->sptes[i]; ++i)
				1295	++count;
				1296	desc->sptes[i] = spte;
				1297	}
				1298	return count;
				1299	}
				1300
				1301	static void
				1302	pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
				1303	struct pte_list_desc *desc, int i,
				1304	struct pte_list_desc *prev_desc)
				1305	{
				1306	int j;
				1307
				1308	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
				1309	;
				1310	desc->sptes[i] = desc->sptes[j];
				1311	desc->sptes[j] = NULL;
				1312	if (j != 0)
				1313	return;
				1314	if (!prev_desc && !desc->more)
				1315	rmap_head->val = (unsigned long)desc->sptes[0];
				1316	else
				1317	if (prev_desc)
				1318	prev_desc->more = desc->more;
				1319	else
				1320	rmap_head->val = (unsigned long)desc->more \| 1;
				1321	mmu_free_pte_list_desc(desc);
				1322	}
				1323
				1324	static void pte_list_remove(u64 spte, struct kvm_rmap_head rmap_head)
				1325	{
				1326	struct pte_list_desc *desc;
				1327	struct pte_list_desc *prev_desc;
				1328	int i;
				1329
				1330	if (!rmap_head->val) {
				1331	printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
				1332	BUG();
				1333	} else if (!(rmap_head->val & 1)) {
				1334	rmap_printk("pte_list_remove: %p 1->0\n", spte);
				1335	if ((u64 *)rmap_head->val != spte) {
				1336	printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
				1337	BUG();
				1338	}
				1339	rmap_head->val = 0;
				1340	} else {
				1341	rmap_printk("pte_list_remove: %p many->many\n", spte);
				1342	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1343	prev_desc = NULL;
				1344	while (desc) {
				1345	for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				1346	if (desc->sptes[i] == spte) {
				1347	pte_list_desc_remove_entry(rmap_head,
				1348	desc, i, prev_desc);
				1349	return;
				1350	}
				1351	}
				1352	prev_desc = desc;
				1353	desc = desc->more;
				1354	}
				1355	pr_err("pte_list_remove: %p many->many\n", spte);
				1356	BUG();
				1357	}
				1358	}
				1359
				1360	static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
				1361	struct kvm_memory_slot *slot)
				1362	{
				1363	unsigned long idx;
				1364
				1365	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1366	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
				1367	}
				1368
				1369	static struct kvm_rmap_head gfn_to_rmap(struct kvm kvm, gfn_t gfn,
				1370	struct kvm_mmu_page *sp)
				1371	{
				1372	struct kvm_memslots *slots;
				1373	struct kvm_memory_slot *slot;
				1374
				1375	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1376	slot = __gfn_to_memslot(slots, gfn);
				1377	return __gfn_to_rmap(gfn, sp->role.level, slot);
				1378	}
				1379
				1380	static bool rmap_can_add(struct kvm_vcpu *vcpu)
				1381	{
				1382	struct kvm_mmu_memory_cache *cache;
				1383
				1384	cache = &vcpu->arch.mmu_pte_list_desc_cache;
				1385	return mmu_memory_cache_free_objects(cache);
				1386	}
				1387
				1388	static int rmap_add(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1389	{
				1390	struct kvm_mmu_page *sp;
				1391	struct kvm_rmap_head *rmap_head;
				1392
				1393	sp = page_header(__pa(spte));
				1394	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
				1395	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1396	return pte_list_add(vcpu, spte, rmap_head);
				1397	}
				1398
				1399	static void rmap_remove(struct kvm kvm, u64 spte)
				1400	{
				1401	struct kvm_mmu_page *sp;
				1402	gfn_t gfn;
				1403	struct kvm_rmap_head *rmap_head;
				1404
				1405	sp = page_header(__pa(spte));
				1406	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
				1407	rmap_head = gfn_to_rmap(kvm, gfn, sp);
				1408	pte_list_remove(spte, rmap_head);
				1409	}
				1410
				1411	/*
				1412	* Used by the following functions to iterate through the sptes linked by a
				1413	* rmap. All fields are private and not assumed to be used outside.
				1414	*/
				1415	struct rmap_iterator {
				1416	/* private fields */
				1417	struct pte_list_desc desc; / holds the sptep if not NULL */
				1418	int pos; /* index of the sptep */
				1419	};
				1420
				1421	/*
				1422	* Iteration must be started by this function. This should also be used after
				1423	* removing/dropping sptes from the rmap link because in such cases the
				1424	* information in the itererator may not be valid.
				1425	*
				1426	* Returns sptep if found, NULL otherwise.
				1427	*/
				1428	static u64 rmap_get_first(struct kvm_rmap_head rmap_head,
				1429	struct rmap_iterator *iter)
				1430	{
				1431	u64 *sptep;
				1432
				1433	if (!rmap_head->val)
				1434	return NULL;
				1435
				1436	if (!(rmap_head->val & 1)) {
				1437	iter->desc = NULL;
				1438	sptep = (u64 *)rmap_head->val;
				1439	goto out;
				1440	}
				1441
				1442	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1443	iter->pos = 0;
				1444	sptep = iter->desc->sptes[iter->pos];
				1445	out:
				1446	BUG_ON(!is_shadow_present_pte(*sptep));
				1447	return sptep;
				1448	}
				1449
				1450	/*
				1451	* Must be used with a valid iterator: e.g. after rmap_get_first().
				1452	*
				1453	* Returns sptep if found, NULL otherwise.
				1454	*/
				1455	static u64 rmap_get_next(struct rmap_iterator iter)
				1456	{
				1457	u64 *sptep;
				1458
				1459	if (iter->desc) {
				1460	if (iter->pos < PTE_LIST_EXT - 1) {
				1461	++iter->pos;
				1462	sptep = iter->desc->sptes[iter->pos];
				1463	if (sptep)
				1464	goto out;
				1465	}
				1466
				1467	iter->desc = iter->desc->more;
				1468
				1469	if (iter->desc) {
				1470	iter->pos = 0;
				1471	/* desc->sptes[0] cannot be NULL */
				1472	sptep = iter->desc->sptes[iter->pos];
				1473	goto out;
				1474	}
				1475	}
				1476
				1477	return NULL;
				1478	out:
				1479	BUG_ON(!is_shadow_present_pte(*sptep));
				1480	return sptep;
				1481	}
				1482
				1483	#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
				1484	for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
				1485	_spte_; _spte_ = rmap_get_next(_iter_))
				1486
				1487	static void drop_spte(struct kvm kvm, u64 sptep)
				1488	{
				1489	if (mmu_spte_clear_track_bits(sptep))
				1490	rmap_remove(kvm, sptep);
				1491	}
				1492
				1493
				1494	static bool __drop_large_spte(struct kvm kvm, u64 sptep)
				1495	{
				1496	if (is_large_pte(*sptep)) {
				1497	WARN_ON(page_header(__pa(sptep))->role.level ==
				1498	PT_PAGE_TABLE_LEVEL);
				1499	drop_spte(kvm, sptep);
				1500	--kvm->stat.lpages;
				1501	return true;
				1502	}
				1503
				1504	return false;
				1505	}
				1506
				1507	static void drop_large_spte(struct kvm_vcpu vcpu, u64 sptep)
				1508	{
				1509	if (__drop_large_spte(vcpu->kvm, sptep))
				1510	kvm_flush_remote_tlbs(vcpu->kvm);
				1511	}
				1512
				1513	/*
				1514	* Write-protect on the specified @sptep, @pt_protect indicates whether
				1515	* spte write-protection is caused by protecting shadow page table.
				1516	*
				1517	* Note: write protection is difference between dirty logging and spte
				1518	* protection:
				1519	* - for dirty logging, the spte can be set to writable at anytime if
				1520	* its dirty bitmap is properly set.
				1521	* - for spte protection, the spte can be writable only after unsync-ing
				1522	* shadow page.
				1523	*
				1524	* Return true if tlb need be flushed.
				1525	*/
				1526	static bool spte_write_protect(u64 *sptep, bool pt_protect)
				1527	{
				1528	u64 spte = *sptep;
				1529
				1530	if (!is_writable_pte(spte) &&
				1531	!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
				1532	return false;
				1533
				1534	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
				1535
				1536	if (pt_protect)
				1537	spte &= ~SPTE_MMU_WRITEABLE;
				1538	spte = spte & ~PT_WRITABLE_MASK;
				1539
				1540	return mmu_spte_update(sptep, spte);
				1541	}
				1542
				1543	static bool __rmap_write_protect(struct kvm *kvm,
				1544	struct kvm_rmap_head *rmap_head,
				1545	bool pt_protect)
				1546	{
				1547	u64 *sptep;
				1548	struct rmap_iterator iter;
				1549	bool flush = false;
				1550
				1551	for_each_rmap_spte(rmap_head, &iter, sptep)
				1552	flush \|= spte_write_protect(sptep, pt_protect);
				1553
				1554	return flush;
				1555	}
				1556
				1557	static bool spte_clear_dirty(u64 *sptep)
				1558	{
				1559	u64 spte = *sptep;
				1560
				1561	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
				1562
				1563	spte &= ~shadow_dirty_mask;
				1564
				1565	return mmu_spte_update(sptep, spte);
				1566	}
				1567
				1568	static bool wrprot_ad_disabled_spte(u64 *sptep)
				1569	{
				1570	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
				1571	(unsigned long *)sptep);
				1572	if (was_writable)
				1573	kvm_set_pfn_dirty(spte_to_pfn(*sptep));
				1574
				1575	return was_writable;
				1576	}
				1577
				1578	/*
				1579	* Gets the GFN ready for another round of dirty logging by clearing the
				1580	* - D bit on ad-enabled SPTEs, and
				1581	* - W bit on ad-disabled SPTEs.
				1582	* Returns true iff any D or W bits were cleared.
				1583	*/
				1584	static bool __rmap_clear_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1585	{
				1586	u64 *sptep;
				1587	struct rmap_iterator iter;
				1588	bool flush = false;
				1589
				1590	for_each_rmap_spte(rmap_head, &iter, sptep)
				1591	if (spte_ad_enabled(*sptep))
				1592	flush \|= spte_clear_dirty(sptep);
				1593	else
				1594	flush \|= wrprot_ad_disabled_spte(sptep);
				1595
				1596	return flush;
				1597	}
				1598
				1599	static bool spte_set_dirty(u64 *sptep)
				1600	{
				1601	u64 spte = *sptep;
				1602
				1603	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
				1604
				1605	spte \|= shadow_dirty_mask;
				1606
				1607	return mmu_spte_update(sptep, spte);
				1608	}
				1609
				1610	static bool __rmap_set_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1611	{
				1612	u64 *sptep;
				1613	struct rmap_iterator iter;
				1614	bool flush = false;
				1615
				1616	for_each_rmap_spte(rmap_head, &iter, sptep)
				1617	if (spte_ad_enabled(*sptep))
				1618	flush \|= spte_set_dirty(sptep);
				1619
				1620	return flush;
				1621	}
				1622
				1623	/**
				1624	* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
				1625	* @kvm: kvm instance
				1626	* @slot: slot to protect
				1627	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1628	* @mask: indicates which pages we should protect
				1629	*
				1630	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1631	* logging we do not have any such mappings.
				1632	*/
				1633	static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				1634	struct kvm_memory_slot *slot,
				1635	gfn_t gfn_offset, unsigned long mask)
				1636	{
				1637	struct kvm_rmap_head *rmap_head;
				1638
				1639	while (mask) {
				1640	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1641	PT_PAGE_TABLE_LEVEL, slot);
				1642	__rmap_write_protect(kvm, rmap_head, false);
				1643
				1644	/* clear the first set bit */
				1645	mask &= mask - 1;
				1646	}
				1647	}
				1648
				1649	/**
				1650	* kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
				1651	* protect the page if the D-bit isn't supported.
				1652	* @kvm: kvm instance
				1653	* @slot: slot to clear D-bit
				1654	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1655	* @mask: indicates which pages we should clear D-bit
				1656	*
				1657	* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
				1658	*/
				1659	void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				1660	struct kvm_memory_slot *slot,
				1661	gfn_t gfn_offset, unsigned long mask)
				1662	{
				1663	struct kvm_rmap_head *rmap_head;
				1664
				1665	while (mask) {
				1666	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1667	PT_PAGE_TABLE_LEVEL, slot);
				1668	__rmap_clear_dirty(kvm, rmap_head);
				1669
				1670	/* clear the first set bit */
				1671	mask &= mask - 1;
				1672	}
				1673	}
				1674	EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
				1675
				1676	/**
				1677	* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
				1678	* PT level pages.
				1679	*
				1680	* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
				1681	* enable dirty logging for them.
				1682	*
				1683	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1684	* logging we do not have any such mappings.
				1685	*/
				1686	void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				1687	struct kvm_memory_slot *slot,
				1688	gfn_t gfn_offset, unsigned long mask)
				1689	{
				1690	if (kvm_x86_ops->enable_log_dirty_pt_masked)
				1691	kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				1692	mask);
				1693	else
				1694	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
				1695	}
				1696
				1697	/**
				1698	* kvm_arch_write_log_dirty - emulate dirty page logging
				1699	* @vcpu: Guest mode vcpu
				1700	*
				1701	* Emulate arch specific page modification logging for the
				1702	* nested hypervisor
				1703	*/
				1704	int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
				1705	{
				1706	if (kvm_x86_ops->write_log_dirty)
				1707	return kvm_x86_ops->write_log_dirty(vcpu);
				1708
				1709	return 0;
				1710	}
				1711
				1712	bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				1713	struct kvm_memory_slot *slot, u64 gfn)
				1714	{
				1715	struct kvm_rmap_head *rmap_head;
				1716	int i;
				1717	bool write_protected = false;
				1718
				1719	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1720	rmap_head = __gfn_to_rmap(gfn, i, slot);
				1721	write_protected \|= __rmap_write_protect(kvm, rmap_head, true);
				1722	}
				1723
				1724	return write_protected;
				1725	}
				1726
				1727	static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
				1728	{
				1729	struct kvm_memory_slot *slot;
				1730
				1731	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1732	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
				1733	}
				1734
				1735	static bool kvm_zap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1736	{
				1737	u64 *sptep;
				1738	struct rmap_iterator iter;
				1739	bool flush = false;
				1740
				1741	while ((sptep = rmap_get_first(rmap_head, &iter))) {
				1742	rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
				1743
				1744	drop_spte(kvm, sptep);
				1745	flush = true;
				1746	}
				1747
				1748	return flush;
				1749	}
				1750
				1751	static int kvm_unmap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1752	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1753	unsigned long data)
				1754	{
				1755	return kvm_zap_rmapp(kvm, rmap_head);
				1756	}
				1757
				1758	static int kvm_set_pte_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1759	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1760	unsigned long data)
				1761	{
				1762	u64 *sptep;
				1763	struct rmap_iterator iter;
				1764	int need_flush = 0;
				1765	u64 new_spte;
				1766	pte_t ptep = (pte_t )data;
				1767	kvm_pfn_t new_pfn;
				1768
				1769	WARN_ON(pte_huge(*ptep));
				1770	new_pfn = pte_pfn(*ptep);
				1771
				1772	restart:
				1773	for_each_rmap_spte(rmap_head, &iter, sptep) {
				1774	rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
				1775	sptep, *sptep, gfn, level);
				1776
				1777	need_flush = 1;
				1778
				1779	if (pte_write(*ptep)) {
				1780	drop_spte(kvm, sptep);
				1781	goto restart;
				1782	} else {
				1783	new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
				1784	new_spte \|= (u64)new_pfn << PAGE_SHIFT;
				1785
				1786	new_spte &= ~PT_WRITABLE_MASK;
				1787	new_spte &= ~SPTE_HOST_WRITEABLE;
				1788
				1789	new_spte = mark_spte_for_access_track(new_spte);
				1790
				1791	mmu_spte_clear_track_bits(sptep);
				1792	mmu_spte_set(sptep, new_spte);
				1793	}
				1794	}
				1795
				1796	if (need_flush)
				1797	kvm_flush_remote_tlbs(kvm);
				1798
				1799	return 0;
				1800	}
				1801
				1802	struct slot_rmap_walk_iterator {
				1803	/* input fields. */
				1804	struct kvm_memory_slot *slot;
				1805	gfn_t start_gfn;
				1806	gfn_t end_gfn;
				1807	int start_level;
				1808	int end_level;
				1809
				1810	/* output fields. */
				1811	gfn_t gfn;
				1812	struct kvm_rmap_head *rmap;
				1813	int level;
				1814
				1815	/* private field. */
				1816	struct kvm_rmap_head *end_rmap;
				1817	};
				1818
				1819	static void
				1820	rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
				1821	{
				1822	iterator->level = level;
				1823	iterator->gfn = iterator->start_gfn;
				1824	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
				1825	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
				1826	iterator->slot);
				1827	}
				1828
				1829	static void
				1830	slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
				1831	struct kvm_memory_slot *slot, int start_level,
				1832	int end_level, gfn_t start_gfn, gfn_t end_gfn)
				1833	{
				1834	iterator->slot = slot;
				1835	iterator->start_level = start_level;
				1836	iterator->end_level = end_level;
				1837	iterator->start_gfn = start_gfn;
				1838	iterator->end_gfn = end_gfn;
				1839
				1840	rmap_walk_init_level(iterator, iterator->start_level);
				1841	}
				1842
				1843	static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
				1844	{
				1845	return !!iterator->rmap;
				1846	}
				1847
				1848	static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
				1849	{
				1850	if (++iterator->rmap <= iterator->end_rmap) {
				1851	iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
				1852	return;
				1853	}
				1854
				1855	if (++iterator->level > iterator->end_level) {
				1856	iterator->rmap = NULL;
				1857	return;
				1858	}
				1859
				1860	rmap_walk_init_level(iterator, iterator->level);
				1861	}
				1862
				1863	#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
				1864	_start_gfn, _end_gfn, _iter_) \
				1865	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
				1866	_end_level_, _start_gfn, _end_gfn); \
				1867	slot_rmap_walk_okay(_iter_); \
				1868	slot_rmap_walk_next(_iter_))
				1869
				1870	static int kvm_handle_hva_range(struct kvm *kvm,
				1871	unsigned long start,
				1872	unsigned long end,
				1873	unsigned long data,
				1874	int (handler)(struct kvm kvm,
				1875	struct kvm_rmap_head *rmap_head,
				1876	struct kvm_memory_slot *slot,
				1877	gfn_t gfn,
				1878	int level,
				1879	unsigned long data))
				1880	{
				1881	struct kvm_memslots *slots;
				1882	struct kvm_memory_slot *memslot;
				1883	struct slot_rmap_walk_iterator iterator;
				1884	int ret = 0;
				1885	int i;
				1886
				1887	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				1888	slots = __kvm_memslots(kvm, i);
				1889	kvm_for_each_memslot(memslot, slots) {
				1890	unsigned long hva_start, hva_end;
				1891	gfn_t gfn_start, gfn_end;
				1892
				1893	hva_start = max(start, memslot->userspace_addr);
				1894	hva_end = min(end, memslot->userspace_addr +
				1895	(memslot->npages << PAGE_SHIFT));
				1896	if (hva_start >= hva_end)
				1897	continue;
				1898	/*
				1899	* {gfn(page) \| page intersects with [hva_start, hva_end)} =
				1900	* {gfn_start, gfn_start+1, ..., gfn_end-1}.
				1901	*/
				1902	gfn_start = hva_to_gfn_memslot(hva_start, memslot);
				1903	gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
				1904
				1905	for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
				1906	PT_MAX_HUGEPAGE_LEVEL,
				1907	gfn_start, gfn_end - 1,
				1908	&iterator)
				1909	ret \|= handler(kvm, iterator.rmap, memslot,
				1910	iterator.gfn, iterator.level, data);
				1911	}
				1912	}
				1913
				1914	return ret;
				1915	}
				1916
				1917	static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
				1918	unsigned long data,
				1919	int (handler)(struct kvm kvm,
				1920	struct kvm_rmap_head *rmap_head,
				1921	struct kvm_memory_slot *slot,
				1922	gfn_t gfn, int level,
				1923	unsigned long data))
				1924	{
				1925	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
				1926	}
				1927
					/*
					 * Run kvm_unmap_rmapp() on every rmap backing the hva range
					 * [@start, @end); returns the OR of the handler results.
					 */
				1928	int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
				1929	{
				1930	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
				1931	}
				1932
					/*
					 * Run kvm_set_pte_rmapp() on the rmaps backing @hva.  The address of
					 * the by-value @pte is handed to the handler as its opaque data
					 * argument; the handler's return value is discarded.
					 */
				1933	void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
				1934	{
				1935	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
				1936	}
				1937
				1938	static int kvm_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1939	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1940	unsigned long data)
				1941	{
				1942	u64 *sptep;
				1943	struct rmap_iterator uninitialized_var(iter);
				1944	int young = 0;
				1945
				1946	for_each_rmap_spte(rmap_head, &iter, sptep)
				1947	young \|= mmu_spte_age(sptep);
				1948
				1949	trace_kvm_age_page(gfn, level, slot, young);
				1950	return young;
				1951	}
				1952
				1953	static int kvm_test_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1954	struct kvm_memory_slot *slot, gfn_t gfn,
				1955	int level, unsigned long data)
				1956	{
				1957	u64 *sptep;
				1958	struct rmap_iterator iter;
				1959
				1960	for_each_rmap_spte(rmap_head, &iter, sptep)
				1961	if (is_accessed_spte(*sptep))
				1962	return 1;
				1963	return 0;
				1964	}
				1965
				1966	#define RMAP_RECYCLE_THRESHOLD 1000
				1967
				1968	static void rmap_recycle(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1969	{
				1970	struct kvm_rmap_head *rmap_head;
				1971	struct kvm_mmu_page *sp;
				1972
				1973	sp = page_header(__pa(spte));
				1974
				1975	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1976
				1977	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
				1978	kvm_flush_remote_tlbs(vcpu->kvm);
				1979	}
				1980
					/*
					 * Run kvm_age_rmapp() over the hva range [@start, @end); returns
					 * non-zero if any handler invocation reported a young spte.
					 */
				1981	int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
				1982	{
				1983	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
				1984	}
				1985
					/*
					 * Run kvm_test_age_rmapp() on the rmaps backing @hva; returns non-zero
					 * if any spte mapping it is marked accessed.
					 */
				1986	int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
				1987	{
				1988	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
				1989	}
				1990
				1991	#ifdef MMU_DEBUG
					/*
					 * Debug helper (MMU_DEBUG builds only): scan the PAGE_SIZE / sizeof(u64)
					 * entries of @spt.  Returns 1 if none is shadow-present; otherwise logs
					 * the first present entry and returns 0.
					 */
				1992	static int is_empty_shadow_page(u64 *spt)
				1993	{
				1994	u64 *pos;
				1995	u64 *end;
				1996
				1997	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
				1998	if (is_shadow_present_pte(*pos)) {
				1999	printk(KERN_ERR "%s: %p %llx\n", __func__,
				2000	pos, *pos);
				2001	return 0;
				2002	}
				2003	return 1;
				2004	}
				2005	#endif
				2006
				2007	/*
				2008	* This value is the sum of all of the kvm instances'
				2009	* kvm->arch.n_used_mmu_pages values. We need a global,
				2010	* aggregate version in order to make the slab shrinker
				2011	* faster
				2012	*/
					/*
					 * Adjust both the per-VM counter and the global per-cpu counter by @nr.
					 * Callers in this file pass +1 and -1; since @nr is unsigned long, the
					 * -1 case relies on unsigned wrap-around for the per-VM subtraction.
					 */
				2013	static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
				2014	{
				2015	kvm->arch.n_used_mmu_pages += nr;
				2016	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
				2017	}
				2018
					/*
					 * Release a shadow page: unlink it from its hash chain and from the
					 * list it is on, free the spt page (and the gfns page for non-direct
					 * pages), then return the header to mmu_page_header_cache.
					 */
				2019	static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
				2020	{
				2021	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
				2022	hlist_del(&sp->hash_link);
				2023	list_del(&sp->link);
				2024	free_page((unsigned long)sp->spt);
					/* gfns is only allocated for non-direct pages (see kvm_mmu_alloc_page()). */
				2025	if (!sp->role.direct)
				2026	free_page((unsigned long)sp->gfns);
				2027	kmem_cache_free(mmu_page_header_cache, sp);
				2028	}
				2029
					/* Hash @gfn into a KVM_MMU_HASH_SHIFT-bit index for arch.mmu_page_hash[]. */
				2030	static unsigned kvm_page_table_hashfn(gfn_t gfn)
				2031	{
				2032	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
				2033	}
				2034
				2035	static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				2036	struct kvm_mmu_page sp, u64 parent_pte)
				2037	{
				2038	if (!parent_pte)
				2039	return;
				2040
				2041	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
				2042	}
				2043
					/* Remove @parent_pte from @sp's parent_ptes list. */
				2044	static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				2045	u64 *parent_pte)
				2046	{
				2047	pte_list_remove(parent_pte, &sp->parent_ptes);
				2048	}
				2049
					/*
					 * Unlink @sp from the parent entry @parent_pte: remove the back-pointer
					 * first, then clear the parent spte itself (no tracking).
					 */
				2050	static void drop_parent_pte(struct kvm_mmu_page *sp,
				2051	u64 *parent_pte)
				2052	{
				2053	mmu_page_remove_parent_pte(sp, parent_pte);
				2054	mmu_spte_clear_no_track(parent_pte);
				2055	}
				2056
				2057	static struct kvm_mmu_page kvm_mmu_alloc_page(struct kvm_vcpu vcpu, int direct)
				2058	{
				2059	struct kvm_mmu_page *sp;
				2060
				2061	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
				2062	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2063	if (!direct)
				2064	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2065	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
				2066
				2067	/*
				2068	* The active_mmu_pages list is the FIFO list, do not move the
				2069	* page until it is zapped. kvm_zap_obsolete_pages depends on
				2070	* this feature. See the comments in kvm_zap_obsolete_pages().
				2071	*/
				2072	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
				2073	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
				2074	return sp;
				2075	}
				2076
					/* Forward declaration: mark_unsync() and the function below are mutually recursive. */
				2077	static void mark_unsync(u64 *spte);
					/* Call mark_unsync() on every parent spte that points at @sp. */
				2078	static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
				2079	{
				2080	u64 *sptep;
				2081	struct rmap_iterator iter;
				2082
				2083	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
				2084	mark_unsync(sptep);
				2085	}
				2086	}
				2087
					/*
					 * Mark the slot of @spte as having an unsync child in its containing
					 * shadow page and propagate upwards.  Propagation stops as soon as the
					 * bit was already set, or the page already had other unsync children.
					 */
				2088	static void mark_unsync(u64 *spte)
				2089	{
				2090	struct kvm_mmu_page *sp;
				2091	unsigned int index;
				2092
				2093	sp = page_header(__pa(spte));
					/* Index of @spte within its shadow page table. */
				2094	index = spte - sp->spt;
				2095	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
				2096	return;
					/* Only the 0 -> 1 transition needs to be reported to the parents. */
				2097	if (sp->unsync_children++)
				2098	return;
				2099	kvm_mmu_mark_parents_unsync(sp);
				2100	}
				2101
					/* sync_page stub for the non-paging case: does nothing and always returns 0. */
				2102	static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
				2103	struct kvm_mmu_page *sp)
				2104	{
				2105	return 0;
				2106	}
				2107
					/* invlpg stub for the non-paging case: intentionally empty. */
				2108	static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
				2109	{
				2110	}
				2111
				2112	static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				2113	struct kvm_mmu_page sp, u64 spte,
				2114	const void *pte)
				2115	{
				2116	WARN_ON(1);
				2117	}
				2118
					/* Capacity of the fixed-size page vector below. */
				2119	#define KVM_PAGE_ARRAY_NR 16
				2120
					/*
					 * Fixed-capacity vector of shadow pages collected by the unsync walk.
					 * Each entry pairs a page with the index passed to mmu_pages_add()
					 * (the child's slot in its parent, or INVALID_INDEX for the walk root).
					 * @nr is the number of valid entries.
					 */
				2121	struct kvm_mmu_pages {
				2122	struct mmu_page_and_offset {
				2123	struct kvm_mmu_page *sp;
				2124	unsigned int idx;
				2125	} page[KVM_PAGE_ARRAY_NR];
				2126	unsigned int nr;
				2127	};
				2128
				2129	static int mmu_pages_add(struct kvm_mmu_pages pvec, struct kvm_mmu_page sp,
				2130	int idx)
				2131	{
				2132	int i;
				2133
				2134	if (sp->unsync)
				2135	for (i=0; i < pvec->nr; i++)
				2136	if (pvec->page[i].sp == sp)
				2137	return 0;
				2138
				2139	pvec->page[pvec->nr].sp = sp;
				2140	pvec->page[pvec->nr].idx = idx;
				2141	pvec->nr++;
				2142	return (pvec->nr == KVM_PAGE_ARRAY_NR);
				2143	}
				2144
					/*
					 * Drop one unsync child from @sp: decrement the counter (warning if it
					 * would go negative) and clear bit @idx in the unsync child bitmap.
					 */
				2145	static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
				2146	{
				2147	--sp->unsync_children;
				2148	WARN_ON((int)sp->unsync_children < 0);
				2149	__clear_bit(idx, sp->unsync_child_bitmap);
				2150	}
				2151
				2152	static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
				2153	struct kvm_mmu_pages *pvec)
				2154	{
				2155	int i, ret, nr_unsync_leaf = 0;
				2156
				2157	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
				2158	struct kvm_mmu_page *child;
				2159	u64 ent = sp->spt[i];
				2160
				2161	if (!is_shadow_present_pte(ent) \|\| is_large_pte(ent)) {
				2162	clear_unsync_child_bit(sp, i);
				2163	continue;
				2164	}
				2165
				2166	child = page_header(ent & PT64_BASE_ADDR_MASK);
				2167
				2168	if (child->unsync_children) {
				2169	if (mmu_pages_add(pvec, child, i))
				2170	return -ENOSPC;
				2171
				2172	ret = __mmu_unsync_walk(child, pvec);
				2173	if (!ret) {
				2174	clear_unsync_child_bit(sp, i);
				2175	continue;
				2176	} else if (ret > 0) {
				2177	nr_unsync_leaf += ret;
				2178	} else
				2179	return ret;
				2180	} else if (child->unsync) {
				2181	nr_unsync_leaf++;
				2182	if (mmu_pages_add(pvec, child, i))
				2183	return -ENOSPC;
				2184	} else
				2185	clear_unsync_child_bit(sp, i);
				2186	}
				2187
				2188	return nr_unsync_leaf;
				2189	}
				2190
					/* Index stored in the page vector for the root of a walk (it has no parent slot). */
				2191	#define INVALID_INDEX (-1)
				2192
					/*
					 * Reset @pvec and collect the unsync pages below @sp.  Returns 0
					 * immediately if @sp has no unsync children; otherwise @sp itself is
					 * added first (with INVALID_INDEX) and the result of
					 * __mmu_unsync_walk() is returned.
					 */
				2193	static int mmu_unsync_walk(struct kvm_mmu_page *sp,
				2194	struct kvm_mmu_pages *pvec)
				2195	{
				2196	pvec->nr = 0;
				2197	if (!sp->unsync_children)
				2198	return 0;
				2199
				2200	mmu_pages_add(pvec, sp, INVALID_INDEX);
				2201	return __mmu_unsync_walk(sp, pvec);
				2202	}
				2203
				2204	static void kvm_unlink_unsync_page(struct kvm kvm, struct kvm_mmu_page sp)
				2205	{
				2206	WARN_ON(!sp->unsync);
				2207	trace_kvm_mmu_sync_page(sp);
				2208	sp->unsync = 0;
				2209	--kvm->stat.mmu_unsync;
				2210	}
				2211
				2212	static int kvm_mmu_prepare_zap_page(struct kvm kvm, struct kvm_mmu_page sp,
				2213	struct list_head *invalid_list);
				2214	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				2215	struct list_head *invalid_list);
				2216
				2217	/*
				2218	* NOTE: we should pay more attention to the zapped-obsolete page
				2219	* (is_obsolete_sp(sp) && sp->role.invalid) when you do a hash list walk
				2220	* since it has been deleted from active_mmu_pages but can still be found
				2221	* in the hash list.
				2222	*
				2223	* for_each_valid_sp() skips that kind of page.
				2224	*/
					/*
					 * Iterate @_sp over the hash bucket of @_gfn, skipping pages that are
					 * obsolete or invalid.  The trailing "if (...) {} else" makes the
					 * statement following the macro the loop body.
					 */
				2225	#define for_each_valid_sp(_kvm, _sp, _gfn) \
				2226	hlist_for_each_entry(_sp, \
				2227	&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
				2228	if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
				2229	} else
				2230
				2231	#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
				2232	for_each_valid_sp(_kvm, _sp, _gfn) \
				2233	if ((_sp)->gfn != (_gfn) \|\| (_sp)->role.direct) {} else
				2234
				2235	/* @sp->gfn should be write-protected at the call site */
					/*
					 * Try to sync @sp.  If its cr4_pae role bit does not match the vcpu's
					 * current PAE state, or the mmu's sync_page() callback returns 0, the
					 * page is queued for zapping on @invalid_list and false is returned.
					 * Returns true when the page was synced and kept.
					 */
				2236	static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
				2237	struct list_head *invalid_list)
				2238	{
				2239	if (sp->role.cr4_pae != !!is_pae(vcpu)
				2240	|| vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
				2241	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
				2242	return false;
				2243	}
				2244
				2245	return true;
				2246	}
				2247
					/*
					 * Finish a batch of MMU changes.  If @invalid_list is non-empty, commit
					 * the zap (which flushes remote TLBs itself) and return.  Otherwise do
					 * a remote flush if @remote_flush, else request a TLB flush on @vcpu
					 * only if @local_flush.
					 */
				2248	static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				2249	struct list_head *invalid_list,
				2250	bool remote_flush, bool local_flush)
				2251	{
				2252	if (!list_empty(invalid_list)) {
				2253	kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
				2254	return;
				2255	}
				2256
				2257	if (remote_flush)
				2258	kvm_flush_remote_tlbs(vcpu->kvm);
				2259	else if (local_flush)
				2260	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2261	}
				2262
					/*
					 * MMU audit support: textually include the implementation when
					 * CONFIG_KVM_MMU_AUDIT is set, otherwise provide empty stubs.
					 */
				2263	#ifdef CONFIG_KVM_MMU_AUDIT
				2264	#include "mmu_audit.c"
				2265	#else
				2266	static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
				2267	static void mmu_audit_disable(void) { }
				2268	#endif
				2269
				2270	static bool is_obsolete_sp(struct kvm kvm, struct kvm_mmu_page sp)
				2271	{
				2272	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
				2273	}
				2274
				2275	static bool kvm_sync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				2276	struct list_head *invalid_list)
				2277	{
				2278	kvm_unlink_unsync_page(vcpu->kvm, sp);
				2279	return __kvm_sync_page(vcpu, sp, invalid_list);
				2280	}
				2281
				2282	/* @gfn should be write-protected at the call site */
					/*
					 * Sync every unsync, valid, indirect shadow page of @gfn.  Pages that
					 * fail to sync are queued on @invalid_list.  Returns true if at least
					 * one page was synced successfully.
					 */
				2283	static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
				2284	struct list_head *invalid_list)
				2285	{
				2286	struct kvm_mmu_page *s;
				2287	bool ret = false;
				2288
				2289	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
				2290	if (!s->unsync)
				2291	continue;
				2292
				2293	WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
				2294	ret |= kvm_sync_page(vcpu, s, invalid_list);
				2295	}
				2296
				2297	return ret;
				2298	}
				2299
					/*
					 * Ancestor chain of the page currently visited by for_each_sp().
					 * As filled in by mmu_pages_first()/mmu_pages_next(): a page of level L
					 * is stored in parent[L-2], and idx[L-1] holds the vector index recorded
					 * for a page of level L.
					 */
				2300	struct mmu_page_path {
				2301	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
				2302	unsigned int idx[PT64_ROOT_MAX_LEVEL];
				2303	};
				2304
					/*
					 * Iterate @sp over the entries of @pvec selected by mmu_pages_first()
					 * and mmu_pages_next(), keeping @parents up to date for each step.
					 * @pvec and @parents are passed as objects, not pointers.
					 */
				2305	#define for_each_sp(pvec, sp, parents, i) \
				2306	for (i = mmu_pages_first(&pvec, &parents); \
				2307	i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
				2308	i = mmu_pages_next(&pvec, &parents, i))
				2309
					/*
					 * Advance from entry @i to the next PT_PAGE_TABLE_LEVEL page in @pvec.
					 * Every entry passed on the way has its idx recorded in @parents; the
					 * non-leaf ones are also recorded as parents.
					 *
					 * Returns the index of the leaf found, or pvec->nr if there is none.
					 */
				2310	static int mmu_pages_next(struct kvm_mmu_pages *pvec,
				2311	struct mmu_page_path *parents,
				2312	int i)
				2313	{
				2314	int n;
				2315
				2316	for (n = i+1; n < pvec->nr; n++) {
				2317	struct kvm_mmu_page *sp = pvec->page[n].sp;
				2318	unsigned idx = pvec->page[n].idx;
				2319	int level = sp->role.level;
				2320
				2321	parents->idx[level-1] = idx;
				2322	if (level == PT_PAGE_TABLE_LEVEL)
				2323	break;
				2324
				2325	parents->parent[level-2] = sp;
				2326	}
				2327
				2328	return n;
				2329	}
				2330
					/*
					 * Start an iteration over @pvec: record entry 0 (the walk root) as a
					 * parent, place a NULL sentinel one level above it, and return the
					 * index of the first leaf via mmu_pages_next().  Returns 0 for an empty
					 * vector.
					 */
				2331	static int mmu_pages_first(struct kvm_mmu_pages *pvec,
				2332	struct mmu_page_path *parents)
				2333	{
				2334	struct kvm_mmu_page *sp;
				2335	int level;
				2336
				2337	if (pvec->nr == 0)
				2338	return 0;
				2339
				2340	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
				2341
				2342	sp = pvec->page[0].sp;
				2343	level = sp->role.level;
				2344	WARN_ON(level == PT_PAGE_TABLE_LEVEL);
				2345
				2346	parents->parent[level-2] = sp;
				2347
				2348	/* Also set up a sentinel. Further entries in pvec are all
				2349	* children of sp, so this element is never overwritten.
				2350	*/
				2351	parents->parent[level-1] = NULL;
				2352	return mmu_pages_next(pvec, parents, 0);
				2353	}
				2354
					/*
					 * Walk up the recorded ancestor chain, clearing at each parent the
					 * unsync child bit for the recorded slot.  Stops at the NULL sentinel,
					 * or after the first parent that still has other unsync children.
					 */
				2355	static void mmu_pages_clear_parents(struct mmu_page_path *parents)
				2356	{
				2357	struct kvm_mmu_page *sp;
				2358	unsigned int level = 0;
				2359
				2360	do {
				2361	unsigned int idx = parents->idx[level];
				2362	sp = parents->parent[level];
				2363	if (!sp)
				2364	return;
				2365
				2366	WARN_ON(idx == INVALID_INDEX);
				2367	clear_unsync_child_bit(sp, idx);
				2368	level++;
				2369	} while (!sp->unsync_children);
				2370	}
				2371
				2372	static void mmu_sync_children(struct kvm_vcpu *vcpu,
				2373	struct kvm_mmu_page *parent)
				2374	{
				2375	int i;
				2376	struct kvm_mmu_page *sp;
				2377	struct mmu_page_path parents;
				2378	struct kvm_mmu_pages pages;
				2379	LIST_HEAD(invalid_list);
				2380	bool flush = false;
				2381
				2382	while (mmu_unsync_walk(parent, &pages)) {
				2383	bool protected = false;
				2384
				2385	for_each_sp(pages, sp, parents, i)
				2386	protected \|= rmap_write_protect(vcpu, sp->gfn);
				2387
				2388	if (protected) {
				2389	kvm_flush_remote_tlbs(vcpu->kvm);
				2390	flush = false;
				2391	}
				2392
				2393	for_each_sp(pages, sp, parents, i) {
				2394	flush \|= kvm_sync_page(vcpu, sp, &invalid_list);
				2395	mmu_pages_clear_parents(&parents);
				2396	}
				2397	if (need_resched() \|\| spin_needbreak(&vcpu->kvm->mmu_lock)) {
				2398	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2399	cond_resched_lock(&vcpu->kvm->mmu_lock);
				2400	flush = false;
				2401	}
				2402	}
				2403
				2404	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2405	}
				2406
					/* Reset @sp's write-flooding counter to zero. */
				2407	static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
				2408	{
				2409	atomic_set(&sp->write_flooding_count, 0);
				2410	}
				2411
					/* Reset the write-flooding counter of the shadow page that contains @spte. */
				2412	static void clear_sp_write_flooding_count(u64 *spte)
				2413	{
				2414	struct kvm_mmu_page *sp = page_header(__pa(spte));
				2415
				2416	__clear_sp_write_flooding_count(sp);
				2417	}
				2418
				2419	static struct kvm_mmu_page kvm_mmu_get_page(struct kvm_vcpu vcpu,
				2420	gfn_t gfn,
				2421	gva_t gaddr,
				2422	unsigned level,
				2423	int direct,
				2424	unsigned access)
				2425	{
				2426	union kvm_mmu_page_role role;
				2427	unsigned quadrant;
				2428	struct kvm_mmu_page *sp;
				2429	bool need_sync = false;
				2430	bool flush = false;
				2431	int collisions = 0;
				2432	LIST_HEAD(invalid_list);
				2433
				2434	role = vcpu->arch.mmu.base_role;
				2435	role.level = level;
				2436	role.direct = direct;
				2437	if (role.direct)
				2438	role.cr4_pae = 0;
				2439	role.access = access;
				2440	if (!vcpu->arch.mmu.direct_map
				2441	&& vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
				2442	quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
				2443	quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
				2444	role.quadrant = quadrant;
				2445	}
				2446	for_each_valid_sp(vcpu->kvm, sp, gfn) {
				2447	if (sp->gfn != gfn) {
				2448	collisions++;
				2449	continue;
				2450	}
				2451
				2452	if (!need_sync && sp->unsync)
				2453	need_sync = true;
				2454
				2455	if (sp->role.word != role.word)
				2456	continue;
				2457
				2458	if (sp->unsync) {
				2459	/* The page is good, but __kvm_sync_page might still end
				2460	* up zapping it. If so, break in order to rebuild it.
				2461	*/
				2462	if (!__kvm_sync_page(vcpu, sp, &invalid_list))
				2463	break;
				2464
				2465	WARN_ON(!list_empty(&invalid_list));
				2466	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2467	}
				2468
				2469	if (sp->unsync_children)
				2470	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				2471
				2472	__clear_sp_write_flooding_count(sp);
				2473	trace_kvm_mmu_get_page(sp, false);
				2474	goto out;
				2475	}
				2476
				2477	++vcpu->kvm->stat.mmu_cache_miss;
				2478
				2479	sp = kvm_mmu_alloc_page(vcpu, direct);
				2480
				2481	sp->gfn = gfn;
				2482	sp->role = role;
				2483	hlist_add_head(&sp->hash_link,
				2484	&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
				2485	if (!direct) {
				2486	/*
				2487	* we should do write protection before syncing pages
				2488	* otherwise the content of the synced shadow page may
				2489	* be inconsistent with guest page table.
				2490	*/
				2491	account_shadowed(vcpu->kvm, sp);
				2492	if (level == PT_PAGE_TABLE_LEVEL &&
				2493	rmap_write_protect(vcpu, gfn))
				2494	kvm_flush_remote_tlbs(vcpu->kvm);
				2495
				2496	if (level > PT_PAGE_TABLE_LEVEL && need_sync)
				2497	flush \|= kvm_sync_pages(vcpu, gfn, &invalid_list);
				2498	}
				2499	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
				2500	clear_page(sp->spt);
				2501	trace_kvm_mmu_get_page(sp, true);
				2502
				2503	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2504	out:
				2505	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
				2506	vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
				2507	return sp;
				2508	}
				2509
					/*
					 * Position @iterator at the top of the shadow page table rooted at
					 * @root for address @addr.  The start level is the mmu's
					 * shadow_root_level, lowered by one when a 4-level shadow root backs a
					 * guest with fewer levels and the mmu is not direct-mapped.  For a PAE
					 * root the walk starts from the pae_root[] entry selected by bits 31:30
					 * of @addr; if that entry is empty the level is set to 0 so the walk
					 * ends immediately.
					 */
				2510	static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
				2511	struct kvm_vcpu *vcpu, hpa_t root,
				2512	u64 addr)
				2513	{
				2514	iterator->addr = addr;
				2515	iterator->shadow_addr = root;
				2516	iterator->level = vcpu->arch.mmu.shadow_root_level;
				2517
				2518	if (iterator->level == PT64_ROOT_4LEVEL &&
				2519	vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
				2520	!vcpu->arch.mmu.direct_map)
				2521	--iterator->level;
				2522
				2523	if (iterator->level == PT32E_ROOT_LEVEL) {
				2524	/*
				2525	* prev_root is currently only used for 64-bit hosts. So only
				2526	* the active root_hpa is valid here.
				2527	*/
				2528	BUG_ON(root != vcpu->arch.mmu.root_hpa);
				2529
				2530	iterator->shadow_addr
				2531	= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
				2532	iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
				2533	--iterator->level;
				2534	if (!iterator->shadow_addr)
				2535	iterator->level = 0;
				2536	}
				2537	}
				2538
					/* Start a shadow walk for @addr from the vcpu's current root_hpa. */
				2539	static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
				2540	struct kvm_vcpu *vcpu, u64 addr)
				2541	{
				2542	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
				2543	addr);
				2544	}
				2545
					/*
					 * Loop condition of a shadow walk.  Returns false once the level has
					 * dropped below PT_PAGE_TABLE_LEVEL; otherwise computes the index and
					 * spte pointer for the current level and returns true.
					 */
				2546	static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
				2547	{
				2548	if (iterator->level < PT_PAGE_TABLE_LEVEL)
				2549	return false;
				2550
				2551	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
				2552	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
				2553	return true;
				2554	}
				2555
					/*
					 * Step the walk one level down using the supplied @spte value.  If
					 * @spte is a last-level entry for the current level, the level is set
					 * to 0 to terminate the walk; otherwise descend into the table it
					 * points to.
					 */
				2556	static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
				2557	u64 spte)
				2558	{
				2559	if (is_last_spte(spte, iterator->level)) {
				2560	iterator->level = 0;
				2561	return;
				2562	}
				2563
				2564	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
				2565	--iterator->level;
				2566	}
				2567
					/* Step the walk using the value currently stored at iterator->sptep. */
				2568	static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
				2569	{
				2570	__shadow_walk_next(iterator, *iterator->sptep);
				2571	}
				2572
				2573	static void link_shadow_page(struct kvm_vcpu vcpu, u64 sptep,
				2574	struct kvm_mmu_page *sp)
				2575	{
				2576	u64 spte;
				2577
				2578	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
				2579
				2580	spte = __pa(sp->spt) \| shadow_present_mask \| PT_WRITABLE_MASK \|
				2581	shadow_user_mask \| shadow_x_mask \| shadow_me_mask;
				2582
				2583	if (sp_ad_disabled(sp))
				2584	spte \|= shadow_acc_track_value;
				2585	else
				2586	spte \|= shadow_accessed_mask;
				2587
				2588	mmu_spte_set(sptep, spte);
				2589
				2590	mmu_page_add_parent_pte(vcpu, sp, sptep);
				2591
				2592	if (sp->unsync_children \|\| sp->unsync)
				2593	mark_unsync(sptep);
				2594	}
				2595
				2596	static void validate_direct_spte(struct kvm_vcpu vcpu, u64 sptep,
				2597	unsigned direct_access)
				2598	{
				2599	if (is_shadow_present_pte(sptep) && !is_large_pte(sptep)) {
				2600	struct kvm_mmu_page *child;
				2601
				2602	/*
				2603	* For the direct sp, if the guest pte's dirty bit
				2604	* changed form clean to dirty, it will corrupt the
				2605	* sp's access: allow writable in the read-only sp,
				2606	* so we should update the spte at this point to get
				2607	* a new sp with the correct access.
				2608	*/
				2609	child = page_header(*sptep & PT64_BASE_ADDR_MASK);
				2610	if (child->role.access == direct_access)
				2611	return;
				2612
				2613	drop_parent_pte(child, sptep);
				2614	kvm_flush_remote_tlbs(vcpu->kvm);
				2615	}
				2616	}
				2617
				2618	static bool mmu_page_zap_pte(struct kvm kvm, struct kvm_mmu_page sp,
				2619	u64 *spte)
				2620	{
				2621	u64 pte;
				2622	struct kvm_mmu_page *child;
				2623
				2624	pte = *spte;
				2625	if (is_shadow_present_pte(pte)) {
				2626	if (is_last_spte(pte, sp->role.level)) {
				2627	drop_spte(kvm, spte);
				2628	if (is_large_pte(pte))
				2629	--kvm->stat.lpages;
				2630	} else {
				2631	child = page_header(pte & PT64_BASE_ADDR_MASK);
				2632	drop_parent_pte(child, spte);
				2633	}
				2634	return true;
				2635	}
				2636
				2637	if (is_mmio_spte(pte))
				2638	mmu_spte_clear_no_track(spte);
				2639
				2640	return false;
				2641	}
				2642
					/* Zap every one of the PT64_ENT_PER_PAGE entries of @sp. */
				2643	static void kvm_mmu_page_unlink_children(struct kvm *kvm,
				2644	struct kvm_mmu_page *sp)
				2645	{
				2646	unsigned i;
				2647
				2648	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
				2649	mmu_page_zap_pte(kvm, sp, sp->spt + i);
				2650	}
				2651
				2652	static void kvm_mmu_unlink_parents(struct kvm kvm, struct kvm_mmu_page sp)
				2653	{
				2654	u64 *sptep;
				2655	struct rmap_iterator iter;
				2656
				2657	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
				2658	drop_parent_pte(sp, sptep);
				2659	}
				2660
					/*
					 * Prepare-zap every unsync page below @parent, batch by batch, queuing
					 * them on @invalid_list.  Leaf-level parents have no children and
					 * return 0 immediately.
					 *
					 * Returns the number of pages processed.
					 */
				2661	static int mmu_zap_unsync_children(struct kvm *kvm,
				2662	struct kvm_mmu_page *parent,
				2663	struct list_head *invalid_list)
				2664	{
				2665	int i, zapped = 0;
				2666	struct mmu_page_path parents;
				2667	struct kvm_mmu_pages pages;
				2668
				2669	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
				2670	return 0;
				2671
				2672	while (mmu_unsync_walk(parent, &pages)) {
				2673	struct kvm_mmu_page *sp;
				2674
				2675	for_each_sp(pages, sp, parents, i) {
				2676	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2677	mmu_pages_clear_parents(&parents);
				2678	zapped++;
				2679	}
				2680	}
				2681
				2682	return zapped;
				2683	}
				2684
				2685	static int kvm_mmu_prepare_zap_page(struct kvm kvm, struct kvm_mmu_page sp,
				2686	struct list_head *invalid_list)
				2687	{
				2688	int ret;
				2689
				2690	trace_kvm_mmu_prepare_zap_page(sp);
				2691	++kvm->stat.mmu_shadow_zapped;
				2692	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
				2693	kvm_mmu_page_unlink_children(kvm, sp);
				2694	kvm_mmu_unlink_parents(kvm, sp);
				2695
				2696	if (!sp->role.invalid && !sp->role.direct)
				2697	unaccount_shadowed(kvm, sp);
				2698
				2699	if (sp->unsync)
				2700	kvm_unlink_unsync_page(kvm, sp);
				2701	if (!sp->root_count) {
				2702	/* Count self */
				2703	ret++;
				2704	list_move(&sp->link, invalid_list);
				2705	kvm_mod_used_mmu_pages(kvm, -1);
				2706	} else {
				2707	list_move(&sp->link, &kvm->arch.active_mmu_pages);
				2708
				2709	/*
				2710	* The obsolete pages can not be used on any vcpus.
				2711	* See the comments in kvm_mmu_invalidate_zap_all_pages().
				2712	*/
				2713	if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
				2714	kvm_reload_remote_mmus(kvm);
				2715	}
				2716
				2717	if (sp->lpage_disallowed)
				2718	unaccount_huge_nx_page(kvm, sp);
				2719
				2720	sp->role.invalid = 1;
				2721	return ret;
				2722	}
				2723
				2724	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				2725	struct list_head *invalid_list)
				2726	{
				2727	struct kvm_mmu_page sp, nsp;
				2728
				2729	if (list_empty(invalid_list))
				2730	return;
				2731
				2732	/*
				2733	* We need to make sure everyone sees our modifications to
				2734	* the page tables and see changes to vcpu->mode here. The barrier
				2735	* in the kvm_flush_remote_tlbs() achieves this. This pairs
				2736	* with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
				2737	*
				2738	* In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
				2739	* guest mode and/or lockless shadow page table walks.
				2740	*/
				2741	kvm_flush_remote_tlbs(kvm);
				2742
				2743	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
				2744	WARN_ON(!sp->role.invalid \|\| sp->root_count);
				2745	kvm_mmu_free_page(sp);
				2746	}
				2747	}
				2748
					/*
					 * Prepare-zap the last entry of active_mmu_pages (new pages are added
					 * at the head, so the tail is the oldest).
					 *
					 * Returns false if the list is empty; otherwise the zap count from
					 * kvm_mmu_prepare_zap_page() converted to bool, i.e. false as well when
					 * nothing was moved to @invalid_list.
					 */
				2749	static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
				2750	struct list_head *invalid_list)
				2751	{
				2752	struct kvm_mmu_page *sp;
				2753
				2754	if (list_empty(&kvm->arch.active_mmu_pages))
				2755	return false;
				2756
				2757	sp = list_last_entry(&kvm->arch.active_mmu_pages,
				2758	struct kvm_mmu_page, link);
				2759	return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2760	}
				2761
				2762	/*
				2763	* Changing the number of mmu pages allocated to the vm
				2764	* Note: if goal_nr_mmu_pages is too small, you will get dead lock
				2765	*/
					/*
					 * Under mmu_lock: if more pages are in use than @goal_nr_mmu_pages, zap
					 * the oldest pages until the goal is met or nothing more can be zapped,
					 * then set n_max_mmu_pages.  When zapping fell short, the limit is set
					 * to the number of pages actually still in use rather than to the goal.
					 */
				2766	void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
				2767	{
				2768	LIST_HEAD(invalid_list);
				2769
				2770	spin_lock(&kvm->mmu_lock);
				2771
				2772	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
				2773	/* Need to free some mmu pages to achieve the goal. */
				2774	while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
				2775	if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				2776	break;
				2777
				2778	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2779	goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
				2780	}
				2781
				2782	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
				2783
				2784	spin_unlock(&kvm->mmu_lock);
				2785	}
				2786
					/*
					 * Zap every valid indirect shadow page that shadows @gfn, under
					 * mmu_lock.
					 *
					 * Returns 1 if at least one such page was found, 0 otherwise.
					 */
				2787	int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
				2788	{
				2789	struct kvm_mmu_page *sp;
				2790	LIST_HEAD(invalid_list);
				2791	int r;
				2792
				2793	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
				2794	r = 0;
				2795	spin_lock(&kvm->mmu_lock);
				2796	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
				2797	pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
				2798	sp->role.word);
				2799	r = 1;
				2800	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
				2801	}
				2802	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2803	spin_unlock(&kvm->mmu_lock);
				2804
				2805	return r;
				2806	}
				2807	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
				2808
				2809	static void kvm_unsync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp)
				2810	{
				2811	trace_kvm_mmu_unsync_page(sp);
				2812	++vcpu->kvm->stat.mmu_unsync;
				2813	sp->unsync = 1;
				2814
				2815	kvm_mmu_mark_parents_unsync(sp);
				2816	}
				2817
					/*
					 * Decide whether @gfn must stay write-protected.
					 *
					 * Returns true if write tracking is active for @gfn, or if @gfn is
					 * shadowed by a valid indirect page and @can_unsync is false.
					 * Otherwise every such page that is still in sync is marked unsync and
					 * false is returned (after a write barrier, see below).
					 */
				2818	static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				2819	bool can_unsync)
				2820	{
				2821	struct kvm_mmu_page *sp;
				2822
				2823	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				2824	return true;
				2825
				2826	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				2827	if (!can_unsync)
				2828	return true;
				2829
				2830	if (sp->unsync)
				2831	continue;
				2832
				2833	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
				2834	kvm_unsync_page(vcpu, sp);
				2835	}
				2836
				2837	/*
				2838	* We need to ensure that the marking of unsync pages is visible
				2839	* before the SPTE is updated to allow writes because
				2840	* kvm_mmu_sync_roots() checks the unsync flags without holding
				2841	* the MMU lock and so can race with this. If the SPTE was updated
				2842	* before the page had been marked as unsync-ed, something like the
				2843	* following could happen:
				2844	*
				2845	* CPU 1 CPU 2
				2846	* ---------------------------------------------------------------------
				2847	* 1.2 Host updates SPTE
				2848	* to be writable
				2849	* 2.1 Guest writes a GPTE for GVA X.
				2850	* (GPTE being in the guest page table shadowed
				2851	* by the SP from CPU 1.)
				2852	* This reads SPTE during the page table walk.
				2853	* Since SPTE.W is read as 1, there is no
				2854	* fault.
				2855	*
				2856	* 2.2 Guest issues TLB flush.
				2857	* That causes a VM Exit.
				2858	*
				2859	* 2.3 kvm_mmu_sync_pages() reads sp->unsync.
				2860	* Since it is false, so it just returns.
				2861	*
				2862	* 2.4 Guest accesses GVA X.
				2863	* Since the mapping in the SP was not updated,
				2864	* so the old mapping for GVA X incorrectly
				2865	* gets used.
				2866	* 1.1 Host marks SP
				2867	* as unsync
				2868	* (sp->unsync = true)
				2869	*
				2870	* The write barrier below ensures that 1.1 happens before 1.2 and thus
				2871	* the situation in 2.4 does not arise. The implicit barrier in 2.2
				2872	* pairs with this write barrier.
				2873	*/
				2874	smp_wmb();
				2875
				2876	return false;
				2877	}
				2878
				2879	static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
				2880	{
				2881	if (pfn_valid(pfn))
				2882	return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
				2883	/*
				2884	* Some reserved pages, such as those from NVDIMM
				2885	* DAX devices, are not for MMIO, and can be mapped
				2886	* with cached memory type for better performance.
				2887	* However, the above check misconceives those pages
				2888	* as MMIO, and results in KVM mapping them with UC
				2889	* memory type, which would hurt the performance.
				2890	* Therefore, we check the host memory type in addition
				2891	* and only treat UC/UC-/WC pages as MMIO.
				2892	*/
				2893	(!pat_enabled() \|\| pat_pfn_immune_to_uc_mtrr(pfn));
				2894
				2895	return true;
				2896	}
				2897
				2898	/* Bits which may be returned by set_spte() */
					/* The two flags are independent bits, so a return value may carry both. */
				2899	#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
				2900	#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
				2901
				2902	static int set_spte(struct kvm_vcpu vcpu, u64 sptep,
				2903	unsigned pte_access, int level,
				2904	gfn_t gfn, kvm_pfn_t pfn, bool speculative,
				2905	bool can_unsync, bool host_writable)
				2906	{
				2907	u64 spte = 0;
				2908	int ret = 0;
				2909	struct kvm_mmu_page *sp;
				2910
				2911	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
				2912	return 0;
				2913
				2914	sp = page_header(__pa(sptep));
				2915	if (sp_ad_disabled(sp))
				2916	spte \|= shadow_acc_track_value;
				2917
				2918	/*
				2919	* For the EPT case, shadow_present_mask is 0 if hardware
				2920	* supports exec-only page table entries. In that case,
				2921	* ACC_USER_MASK and shadow_user_mask are used to represent
				2922	* read access. See FNAME(gpte_access) in paging_tmpl.h.
				2923	*/
				2924	spte \|= shadow_present_mask;
				2925	if (!speculative)
				2926	spte \|= spte_shadow_accessed_mask(spte);
				2927
				2928	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
				2929	is_nx_huge_page_enabled()) {
				2930	pte_access &= ~ACC_EXEC_MASK;
				2931	}
				2932
				2933	if (pte_access & ACC_EXEC_MASK)
				2934	spte \|= shadow_x_mask;
				2935	else
				2936	spte \|= shadow_nx_mask;
				2937
				2938	if (pte_access & ACC_USER_MASK)
				2939	spte \|= shadow_user_mask;
				2940
				2941	if (level > PT_PAGE_TABLE_LEVEL)
				2942	spte \|= PT_PAGE_SIZE_MASK;
				2943	if (tdp_enabled)
				2944	spte \|= kvm_x86_ops->get_mt_mask(vcpu, gfn,
				2945	kvm_is_mmio_pfn(pfn));
				2946
				2947	if (host_writable)
				2948	spte \|= SPTE_HOST_WRITEABLE;
				2949	else
				2950	pte_access &= ~ACC_WRITE_MASK;
				2951
				2952	if (!kvm_is_mmio_pfn(pfn))
				2953	spte \|= shadow_me_mask;
				2954
				2955	spte \|= (u64)pfn << PAGE_SHIFT;
				2956
				2957	if (pte_access & ACC_WRITE_MASK) {
				2958
				2959	/*
				2960	* Other vcpu creates new sp in the window between
				2961	* mapping_level() and acquiring mmu-lock. We can
				2962	* allow guest to retry the access, the mapping can
				2963	* be fixed if guest refault.
				2964	*/
				2965	if (level > PT_PAGE_TABLE_LEVEL &&
				2966	mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
				2967	goto done;
				2968
				2969	spte \|= PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE;
				2970
				2971	/*
				2972	* Optimization: for pte sync, if spte was writable the hash
				2973	* lookup is unnecessary (and expensive). Write protection
				2974	* is responsibility of mmu_get_page / kvm_sync_page.
				2975	* Same reasoning can be applied to dirty page accounting.
				2976	*/
				2977	if (!can_unsync && is_writable_pte(*sptep))
				2978	goto set_pte;
				2979
				2980	if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
				2981	pgprintk("%s: found shadow page for %llx, marking ro\n",
				2982	__func__, gfn);
				2983	ret \|= SET_SPTE_WRITE_PROTECTED_PT;
				2984	pte_access &= ~ACC_WRITE_MASK;
				2985	spte &= ~(PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE);
				2986	}
				2987	}
				2988
				2989	if (pte_access & ACC_WRITE_MASK) {
				2990	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				2991	spte \|= spte_shadow_dirty_mask(spte);
				2992	}
				2993
				2994	if (speculative)
				2995	spte = mark_spte_for_access_track(spte);
				2996
				2997	set_pte:
				2998	if (mmu_spte_update(sptep, spte))
				2999	ret \|= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
				3000	done:
				3001	return ret;
				3002	}
				3003
				3004	static int mmu_set_spte(struct kvm_vcpu vcpu, u64 sptep, unsigned pte_access,
				3005	int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
				3006	bool speculative, bool host_writable)
				3007	{
				3008	int was_rmapped = 0;
				3009	int rmap_count;
				3010	int set_spte_ret;
				3011	int ret = RET_PF_RETRY;
				3012	bool flush = false;
				3013
				3014	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
				3015	*sptep, write_fault, gfn);
				3016
				3017	if (is_shadow_present_pte(*sptep)) {
				3018	/*
				3019	* If we overwrite a PTE page pointer with a 2MB PMD, unlink
				3020	* the parent of the now unreachable PTE.
				3021	*/
				3022	if (level > PT_PAGE_TABLE_LEVEL &&
				3023	!is_large_pte(*sptep)) {
				3024	struct kvm_mmu_page *child;
				3025	u64 pte = *sptep;
				3026
				3027	child = page_header(pte & PT64_BASE_ADDR_MASK);
				3028	drop_parent_pte(child, sptep);
				3029	flush = true;
				3030	} else if (pfn != spte_to_pfn(*sptep)) {
				3031	pgprintk("hfn old %llx new %llx\n",
				3032	spte_to_pfn(*sptep), pfn);
				3033	drop_spte(vcpu->kvm, sptep);
				3034	flush = true;
				3035	} else
				3036	was_rmapped = 1;
				3037	}
				3038
				3039	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
				3040	speculative, true, host_writable);
				3041	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
				3042	if (write_fault)
				3043	ret = RET_PF_EMULATE;
				3044	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				3045	}
				3046	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH \|\| flush)
				3047	kvm_flush_remote_tlbs(vcpu->kvm);
				3048
				3049	if (unlikely(is_mmio_spte(*sptep)))
				3050	ret = RET_PF_EMULATE;
				3051
				3052	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
				3053	trace_kvm_mmu_set_spte(level, gfn, sptep);
				3054	if (!was_rmapped && is_large_pte(*sptep))
				3055	++vcpu->kvm->stat.lpages;
				3056
				3057	if (is_shadow_present_pte(*sptep)) {
				3058	if (!was_rmapped) {
				3059	rmap_count = rmap_add(vcpu, sptep, gfn);
				3060	if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				3061	rmap_recycle(vcpu, sptep, gfn);
				3062	}
				3063	}
				3064
				3065	return ret;
				3066	}
				3067
				3068	static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				3069	bool no_dirty_log)
				3070	{
				3071	struct kvm_memory_slot *slot;
				3072
				3073	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
				3074	if (!slot)
				3075	return KVM_PFN_ERR_FAULT;
				3076
				3077	return gfn_to_pfn_memslot_atomic(slot, gfn);
				3078	}
				3079
				3080	static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				3081	struct kvm_mmu_page *sp,
				3082	u64 start, u64 end)
				3083	{
				3084	struct page *pages[PTE_PREFETCH_NUM];
				3085	struct kvm_memory_slot *slot;
				3086	unsigned access = sp->role.access;
				3087	int i, ret;
				3088	gfn_t gfn;
				3089
				3090	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
				3091	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
				3092	if (!slot)
				3093	return -1;
				3094
				3095	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
				3096	if (ret <= 0)
				3097	return -1;
				3098
				3099	for (i = 0; i < ret; i++, gfn++, start++) {
				3100	mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
				3101	page_to_pfn(pages[i]), true, true);
				3102	put_page(pages[i]);
				3103	}
				3104
				3105	return 0;
				3106	}
				3107
				3108	static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				3109	struct kvm_mmu_page sp, u64 sptep)
				3110	{
				3111	u64 spte, start = NULL;
				3112	int i;
				3113
				3114	WARN_ON(!sp->role.direct);
				3115
				3116	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
				3117	spte = sp->spt + i;
				3118
				3119	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
				3120	if (is_shadow_present_pte(*spte) \|\| spte == sptep) {
				3121	if (!start)
				3122	continue;
				3123	if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				3124	break;
				3125	start = NULL;
				3126	} else if (!start)
				3127	start = spte;
				3128	}
				3129	}
				3130
				3131	static void direct_pte_prefetch(struct kvm_vcpu vcpu, u64 sptep)
				3132	{
				3133	struct kvm_mmu_page *sp;
				3134
				3135	sp = page_header(__pa(sptep));
				3136
				3137	/*
				3138	* Without accessed bits, there's no way to distinguish between
				3139	* actually accessed translations and prefetched, so disable pte
				3140	* prefetch if accessed bits aren't available.
				3141	*/
				3142	if (sp_ad_disabled(sp))
				3143	return;
				3144
				3145	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3146	return;
				3147
				3148	__direct_pte_prefetch(vcpu, sp, sptep);
				3149	}
				3150
				3151	static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
				3152	gfn_t gfn, kvm_pfn_t pfnp, int levelp)
				3153	{
				3154	int level = *levelp;
				3155	u64 spte = *it.sptep;
				3156
				3157	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
				3158	is_nx_huge_page_enabled() &&
				3159	is_shadow_present_pte(spte) &&
				3160	!is_large_pte(spte)) {
				3161	/*
				3162	* A small SPTE exists for this pfn, but FNAME(fetch)
				3163	* and __direct_map would like to create a large PTE
				3164	* instead: just force them to go down another level,
				3165	* patching back for them into pfn the next 9 bits of
				3166	* the address.
				3167	*/
				3168	u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
				3169	*pfnp \|= gfn & page_mask;
				3170	(*levelp)--;
				3171	}
				3172	}
				3173
				3174	static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
				3175	int map_writable, int level, kvm_pfn_t pfn,
				3176	bool prefault, bool lpage_disallowed)
				3177	{
				3178	struct kvm_shadow_walk_iterator it;
				3179	struct kvm_mmu_page *sp;
				3180	int ret;
				3181	gfn_t gfn = gpa >> PAGE_SHIFT;
				3182	gfn_t base_gfn = gfn;
				3183
				3184	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3185	return RET_PF_RETRY;
				3186
				3187	trace_kvm_mmu_spte_requested(gpa, level, pfn);
				3188	for_each_shadow_entry(vcpu, gpa, it) {
				3189	/*
				3190	* We cannot overwrite existing page tables with an NX
				3191	* large page, as the leaf could be executable.
				3192	*/
				3193	disallowed_hugepage_adjust(it, gfn, &pfn, &level);
				3194
				3195	base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
				3196	if (it.level == level)
				3197	break;
				3198
				3199	drop_large_spte(vcpu, it.sptep);
				3200	if (!is_shadow_present_pte(*it.sptep)) {
				3201	sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
				3202	it.level - 1, true, ACC_ALL);
				3203
				3204	link_shadow_page(vcpu, it.sptep, sp);
				3205	if (lpage_disallowed)
				3206	account_huge_nx_page(vcpu->kvm, sp);
				3207	}
				3208	}
				3209
				3210	ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
				3211	write, level, base_gfn, pfn, prefault,
				3212	map_writable);
				3213	direct_pte_prefetch(vcpu, it.sptep);
				3214	++vcpu->stat.pf_fixed;
				3215	return ret;
				3216	}
				3217
				3218	static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
				3219	{
				3220	siginfo_t info;
				3221
				3222	clear_siginfo(&info);
				3223	info.si_signo = SIGBUS;
				3224	info.si_errno = 0;
				3225	info.si_code = BUS_MCEERR_AR;
				3226	info.si_addr = (void __user *)address;
				3227	info.si_addr_lsb = PAGE_SHIFT;
				3228
				3229	send_sig_info(SIGBUS, &info, tsk);
				3230	}
				3231
				3232	static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
				3233	{
				3234	/*
				3235	* Do not cache the mmio info caused by writing the readonly gfn
				3236	* into the spte otherwise read access on readonly gfn also can
				3237	* caused mmio page fault and treat it as mmio access.
				3238	*/
				3239	if (pfn == KVM_PFN_ERR_RO_FAULT)
				3240	return RET_PF_EMULATE;
				3241
				3242	if (pfn == KVM_PFN_ERR_HWPOISON) {
				3243	kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
				3244	return RET_PF_RETRY;
				3245	}
				3246
				3247	return -EFAULT;
				3248	}
				3249
				3250	static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
				3251	gfn_t gfn, kvm_pfn_t *pfnp,
				3252	int *levelp)
				3253	{
				3254	kvm_pfn_t pfn = *pfnp;
				3255	int level = *levelp;
				3256
				3257	/*
				3258	* Check if it's a transparent hugepage. If this would be an
				3259	* hugetlbfs page, level wouldn't be set to
				3260	* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
				3261	* here.
				3262	*/
				3263	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
				3264	!kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
				3265	PageTransCompoundMap(pfn_to_page(pfn)) &&
				3266	!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
				3267	unsigned long mask;
				3268	/*
				3269	* mmu_notifier_retry was successful and we hold the
				3270	* mmu_lock here, so the pmd can't become splitting
				3271	* from under us, and in turn
				3272	* __split_huge_page_refcount() can't run from under
				3273	* us and we can safely transfer the refcount from
				3274	* PG_tail to PG_head as we switch the pfn to tail to
				3275	* head.
				3276	*/
				3277	*levelp = level = PT_DIRECTORY_LEVEL;
				3278	mask = KVM_PAGES_PER_HPAGE(level) - 1;
				3279	VM_BUG_ON((gfn & mask) != (pfn & mask));
				3280	if (pfn & mask) {
				3281	kvm_release_pfn_clean(pfn);
				3282	pfn &= ~mask;
				3283	kvm_get_pfn(pfn);
				3284	*pfnp = pfn;
				3285	}
				3286	}
				3287	}
				3288
				3289	static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
				3290	kvm_pfn_t pfn, unsigned access, int *ret_val)
				3291	{
				3292	/* The pfn is invalid, report the error! */
				3293	if (unlikely(is_error_pfn(pfn))) {
				3294	*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
				3295	return true;
				3296	}
				3297
				3298	if (unlikely(is_noslot_pfn(pfn)))
				3299	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
				3300
				3301	return false;
				3302	}
				3303
				3304	static bool page_fault_can_be_fast(u32 error_code)
				3305	{
				3306	/*
				3307	* Do not fix the mmio spte with invalid generation number which
				3308	* need to be updated by slow page fault path.
				3309	*/
				3310	if (unlikely(error_code & PFERR_RSVD_MASK))
				3311	return false;
				3312
				3313	/* See if the page fault is due to an NX violation */
				3314	if (unlikely(((error_code & (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))
				3315	== (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))))
				3316	return false;
				3317
				3318	/*
				3319	* #PF can be fast if:
				3320	* 1. The shadow page table entry is not present, which could mean that
				3321	* the fault is potentially caused by access tracking (if enabled).
				3322	* 2. The shadow page table entry is present and the fault
				3323	* is caused by write-protect, that means we just need change the W
				3324	* bit of the spte which can be done out of mmu-lock.
				3325	*
				3326	* However, if access tracking is disabled we know that a non-present
				3327	* page must be a genuine page fault where we have to create a new SPTE.
				3328	* So, if access tracking is disabled, we return true only for write
				3329	* accesses to a present page.
				3330	*/
				3331
				3332	return shadow_acc_track_mask != 0 \|\|
				3333	((error_code & (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK))
				3334	== (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK));
				3335	}
				3336
				3337	/*
				3338	* Returns true if the SPTE was fixed successfully. Otherwise,
				3339	* someone else modified the SPTE from its original value.
				3340	*/
				3341	static bool
				3342	fast_pf_fix_direct_spte(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				3343	u64 *sptep, u64 old_spte, u64 new_spte)
				3344	{
				3345	gfn_t gfn;
				3346
				3347	WARN_ON(!sp->role.direct);
				3348
				3349	/*
				3350	* Theoretically we could also set dirty bit (and flush TLB) here in
				3351	* order to eliminate unnecessary PML logging. See comments in
				3352	* set_spte. But fast_page_fault is very unlikely to happen with PML
				3353	* enabled, so we do not do this. This might result in the same GPA
				3354	* to be logged in PML buffer again when the write really happens, and
				3355	* eventually to be called by mark_page_dirty twice. But it's also no
				3356	* harm. This also avoids the TLB flush needed after setting dirty bit
				3357	* so non-PML cases won't be impacted.
				3358	*
				3359	* Compare with set_spte where instead shadow_dirty_mask is set.
				3360	*/
				3361	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
				3362	return false;
				3363
				3364	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
				3365	/*
				3366	* The gfn of direct spte is stable since it is
				3367	* calculated by sp->gfn.
				3368	*/
				3369	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
				3370	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				3371	}
				3372
				3373	return true;
				3374	}
				3375
				3376	static bool is_access_allowed(u32 fault_err_code, u64 spte)
				3377	{
				3378	if (fault_err_code & PFERR_FETCH_MASK)
				3379	return is_executable_pte(spte);
				3380
				3381	if (fault_err_code & PFERR_WRITE_MASK)
				3382	return is_writable_pte(spte);
				3383
				3384	/* Fault was on Read access */
				3385	return spte & PT_PRESENT_MASK;
				3386	}
				3387
				3388	/*
				3389	* Return value:
				3390	* - true: let the vcpu to access on the same address again.
				3391	* - false: let the real page fault path to fix it.
				3392	*/
				3393	static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
				3394	u32 error_code)
				3395	{
				3396	struct kvm_shadow_walk_iterator iterator;
				3397	struct kvm_mmu_page *sp;
				3398	bool fault_handled = false;
				3399	u64 spte = 0ull;
				3400	uint retry_count = 0;
				3401
				3402	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3403	return false;
				3404
				3405	if (!page_fault_can_be_fast(error_code))
				3406	return false;
				3407
				3408	walk_shadow_page_lockless_begin(vcpu);
				3409
				3410	do {
				3411	u64 new_spte;
				3412
				3413	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
				3414	if (!is_shadow_present_pte(spte) \|\|
				3415	iterator.level < level)
				3416	break;
				3417
				3418	sp = page_header(__pa(iterator.sptep));
				3419	if (!is_last_spte(spte, sp->role.level))
				3420	break;
				3421
				3422	/*
				3423	* Check whether the memory access that caused the fault would
				3424	* still cause it if it were to be performed right now. If not,
				3425	* then this is a spurious fault caused by TLB lazily flushed,
				3426	* or some other CPU has already fixed the PTE after the
				3427	* current CPU took the fault.
				3428	*
				3429	* Need not check the access of upper level table entries since
				3430	* they are always ACC_ALL.
				3431	*/
				3432	if (is_access_allowed(error_code, spte)) {
				3433	fault_handled = true;
				3434	break;
				3435	}
				3436
				3437	new_spte = spte;
				3438
				3439	if (is_access_track_spte(spte))
				3440	new_spte = restore_acc_track_spte(new_spte);
				3441
				3442	/*
				3443	* Currently, to simplify the code, write-protection can
				3444	* be removed in the fast path only if the SPTE was
				3445	* write-protected for dirty-logging or access tracking.
				3446	*/
				3447	if ((error_code & PFERR_WRITE_MASK) &&
				3448	spte_can_locklessly_be_made_writable(spte))
				3449	{
				3450	new_spte \|= PT_WRITABLE_MASK;
				3451
				3452	/*
				3453	* Do not fix write-permission on the large spte. Since
				3454	* we only dirty the first page into the dirty-bitmap in
				3455	* fast_pf_fix_direct_spte(), other pages are missed
				3456	* if its slot has dirty logging enabled.
				3457	*
				3458	* Instead, we let the slow page fault path create a
				3459	* normal spte to fix the access.
				3460	*
				3461	* See the comments in kvm_arch_commit_memory_region().
				3462	*/
				3463	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3464	break;
				3465	}
				3466
				3467	/* Verify that the fault can be handled in the fast path */
				3468	if (new_spte == spte \|\|
				3469	!is_access_allowed(error_code, new_spte))
				3470	break;
				3471
				3472	/*
				3473	* Currently, fast page fault only works for direct mapping
				3474	* since the gfn is not stable for indirect shadow page. See
				3475	* Documentation/virtual/kvm/locking.txt to get more detail.
				3476	*/
				3477	fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
				3478	iterator.sptep, spte,
				3479	new_spte);
				3480	if (fault_handled)
				3481	break;
				3482
				3483	if (++retry_count > 4) {
				3484	printk_once(KERN_WARNING
				3485	"kvm: Fast #PF retrying more than 4 times.\n");
				3486	break;
				3487	}
				3488
				3489	} while (true);
				3490
				3491	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
				3492	spte, fault_handled);
				3493	walk_shadow_page_lockless_end(vcpu);
				3494
				3495	return fault_handled;
				3496	}
				3497
				3498	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				3499	gva_t gva, kvm_pfn_t pfn, bool write, bool writable);
				3500	static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
				3501
				3502	static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
				3503	gfn_t gfn, bool prefault)
				3504	{
				3505	int r;
				3506	int level;
				3507	bool force_pt_level;
				3508	kvm_pfn_t pfn;
				3509	unsigned long mmu_seq;
				3510	bool map_writable, write = error_code & PFERR_WRITE_MASK;
				3511	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				3512	is_nx_huge_page_enabled();
				3513
				3514	force_pt_level = lpage_disallowed;
				3515	level = mapping_level(vcpu, gfn, &force_pt_level);
				3516	if (likely(!force_pt_level)) {
				3517	/*
				3518	* This path builds a PAE pagetable - so we can map
				3519	* 2mb pages at maximum. Therefore check if the level
				3520	* is larger than that.
				3521	*/
				3522	if (level > PT_DIRECTORY_LEVEL)
				3523	level = PT_DIRECTORY_LEVEL;
				3524
				3525	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				3526	}
				3527
				3528	if (fast_page_fault(vcpu, v, level, error_code))
				3529	return RET_PF_RETRY;
				3530
				3531	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				3532	smp_rmb();
				3533
				3534	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
				3535	return RET_PF_RETRY;
				3536
				3537	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
				3538	return r;
				3539
				3540	r = RET_PF_RETRY;
				3541	spin_lock(&vcpu->kvm->mmu_lock);
				3542	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				3543	goto out_unlock;
				3544	if (make_mmu_pages_available(vcpu) < 0)
				3545	goto out_unlock;
				3546	if (likely(!force_pt_level))
				3547	transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
				3548	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
				3549	prefault, false);
				3550	out_unlock:
				3551	spin_unlock(&vcpu->kvm->mmu_lock);
				3552	kvm_release_pfn_clean(pfn);
				3553	return r;
				3554	}
				3555
				3556	static void mmu_free_root_page(struct kvm kvm, hpa_t root_hpa,
				3557	struct list_head *invalid_list)
				3558	{
				3559	struct kvm_mmu_page *sp;
				3560
				3561	if (!VALID_PAGE(*root_hpa))
				3562	return;
				3563
				3564	sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
				3565	--sp->root_count;
				3566	if (!sp->root_count && sp->role.invalid)
				3567	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				3568
				3569	*root_hpa = INVALID_PAGE;
				3570	}
				3571
				3572	/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
				3573	void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
				3574	{
				3575	int i;
				3576	LIST_HEAD(invalid_list);
				3577	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				3578	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
				3579
				3580	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
				3581
				3582	/* Before acquiring the MMU lock, see if we need to do any real work. */
				3583	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
				3584	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				3585	if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
				3586	VALID_PAGE(mmu->prev_roots[i].hpa))
				3587	break;
				3588
				3589	if (i == KVM_MMU_NUM_PREV_ROOTS)
				3590	return;
				3591	}
				3592
				3593	spin_lock(&vcpu->kvm->mmu_lock);
				3594
				3595	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				3596	if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
				3597	mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
				3598	&invalid_list);
				3599
				3600	if (free_active_root) {
				3601	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
				3602	(mmu->root_level >= PT64_ROOT_4LEVEL \|\| mmu->direct_map)) {
				3603	mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
				3604	&invalid_list);
				3605	} else {
				3606	for (i = 0; i < 4; ++i)
				3607	if (mmu->pae_root[i] != 0)
				3608	mmu_free_root_page(vcpu->kvm,
				3609	&mmu->pae_root[i],
				3610	&invalid_list);
				3611	mmu->root_hpa = INVALID_PAGE;
				3612	}
				3613	}
				3614
				3615	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				3616	spin_unlock(&vcpu->kvm->mmu_lock);
				3617	}
				3618	EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
				3619
				3620	static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
				3621	{
				3622	int ret = 0;
				3623
				3624	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
				3625	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				3626	ret = 1;
				3627	}
				3628
				3629	return ret;
				3630	}
				3631
				3632	static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
				3633	{
				3634	struct kvm_mmu_page *sp;
				3635	unsigned i;
				3636
				3637	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
				3638	spin_lock(&vcpu->kvm->mmu_lock);
				3639	if(make_mmu_pages_available(vcpu) < 0) {
				3640	spin_unlock(&vcpu->kvm->mmu_lock);
				3641	return -ENOSPC;
				3642	}
				3643	sp = kvm_mmu_get_page(vcpu, 0, 0,
				3644	vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
				3645	++sp->root_count;
				3646	spin_unlock(&vcpu->kvm->mmu_lock);
				3647	vcpu->arch.mmu.root_hpa = __pa(sp->spt);
				3648	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
				3649	for (i = 0; i < 4; ++i) {
				3650	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3651
				3652	MMU_WARN_ON(VALID_PAGE(root));
				3653	spin_lock(&vcpu->kvm->mmu_lock);
				3654	if (make_mmu_pages_available(vcpu) < 0) {
				3655	spin_unlock(&vcpu->kvm->mmu_lock);
				3656	return -ENOSPC;
				3657	}
				3658	sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
				3659	i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
				3660	root = __pa(sp->spt);
				3661	++sp->root_count;
				3662	spin_unlock(&vcpu->kvm->mmu_lock);
				3663	vcpu->arch.mmu.pae_root[i] = root \| PT_PRESENT_MASK;
				3664	}
				3665	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3666	} else
				3667	BUG();
				3668
				3669	return 0;
				3670	}
				3671
				3672	static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
				3673	{
				3674	struct kvm_mmu_page *sp;
				3675	u64 pdptr, pm_mask;
				3676	gfn_t root_gfn;
				3677	int i;
				3678
				3679	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
				3680
				3681	if (mmu_check_root(vcpu, root_gfn))
				3682	return 1;
				3683
				3684	/*
				3685	* Do we shadow a long mode page table? If so we need to
				3686	* write-protect the guests page table root.
				3687	*/
				3688	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3689	hpa_t root = vcpu->arch.mmu.root_hpa;
				3690
				3691	MMU_WARN_ON(VALID_PAGE(root));
				3692
				3693	spin_lock(&vcpu->kvm->mmu_lock);
				3694	if (make_mmu_pages_available(vcpu) < 0) {
				3695	spin_unlock(&vcpu->kvm->mmu_lock);
				3696	return -ENOSPC;
				3697	}
				3698	sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				3699	vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
				3700	root = __pa(sp->spt);
				3701	++sp->root_count;
				3702	spin_unlock(&vcpu->kvm->mmu_lock);
				3703	vcpu->arch.mmu.root_hpa = root;
				3704	return 0;
				3705	}
				3706
				3707	/*
				3708	* We shadow a 32 bit page table. This may be a legacy 2-level
				3709	* or a PAE 3-level page table. In either case we need to be aware that
				3710	* the shadow page table may be a PAE or a long mode page table.
				3711	*/
				3712	pm_mask = PT_PRESENT_MASK;
				3713	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
				3714	pm_mask \|= PT_ACCESSED_MASK \| PT_WRITABLE_MASK \| PT_USER_MASK;
				3715
				3716	for (i = 0; i < 4; ++i) {
				3717	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3718
				3719	MMU_WARN_ON(VALID_PAGE(root));
				3720	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
				3721	pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
				3722	if (!(pdptr & PT_PRESENT_MASK)) {
				3723	vcpu->arch.mmu.pae_root[i] = 0;
				3724	continue;
				3725	}
				3726	root_gfn = pdptr >> PAGE_SHIFT;
				3727	if (mmu_check_root(vcpu, root_gfn))
				3728	return 1;
				3729	}
				3730	spin_lock(&vcpu->kvm->mmu_lock);
				3731	if (make_mmu_pages_available(vcpu) < 0) {
				3732	spin_unlock(&vcpu->kvm->mmu_lock);
				3733	return -ENOSPC;
				3734	}
				3735	sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
				3736	0, ACC_ALL);
				3737	root = __pa(sp->spt);
				3738	++sp->root_count;
				3739	spin_unlock(&vcpu->kvm->mmu_lock);
				3740
				3741	vcpu->arch.mmu.pae_root[i] = root \| pm_mask;
				3742	}
				3743	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3744
				3745	/*
				3746	* If we shadow a 32 bit page table with a long mode page
				3747	* table we enter this path.
				3748	*/
				3749	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
				3750	if (vcpu->arch.mmu.lm_root == NULL) {
				3751	/*
				3752	* The additional page necessary for this is only
				3753	* allocated on demand.
				3754	*/
				3755
				3756	u64 *lm_root;
				3757
				3758	lm_root = (void*)get_zeroed_page(GFP_KERNEL);
				3759	if (lm_root == NULL)
				3760	return 1;
				3761
				3762	lm_root[0] = __pa(vcpu->arch.mmu.pae_root) \| pm_mask;
				3763
				3764	vcpu->arch.mmu.lm_root = lm_root;
				3765	}
				3766
				3767	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
				3768	}
				3769
				3770	return 0;
				3771	}
				3772
				3773	static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
				3774	{
				3775	if (vcpu->arch.mmu.direct_map)
				3776	return mmu_alloc_direct_roots(vcpu);
				3777	else
				3778	return mmu_alloc_shadow_roots(vcpu);
				3779	}
				3780
				3781	void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
				3782	{
				3783	int i;
				3784	struct kvm_mmu_page *sp;
				3785
				3786	if (vcpu->arch.mmu.direct_map)
				3787	return;
				3788
				3789	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3790	return;
				3791
				3792	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
				3793
				3794	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3795	hpa_t root = vcpu->arch.mmu.root_hpa;
				3796
				3797	sp = page_header(root);
				3798
				3799	/*
				3800	* Even if another CPU was marking the SP as unsync-ed
				3801	* simultaneously, any guest page table changes are not
				3802	* guaranteed to be visible anyway until this VCPU issues a TLB
				3803	* flush strictly after those changes are made. We only need to
				3804	* ensure that the other CPU sets these flags before any actual
				3805	* changes to the page tables are made. The comments in
				3806	* mmu_need_write_protect() describe what could go wrong if this
				3807	* requirement isn't satisfied.
				3808	*/
				3809	if (!smp_load_acquire(&sp->unsync) &&
				3810	!smp_load_acquire(&sp->unsync_children))
				3811	return;
				3812
				3813	spin_lock(&vcpu->kvm->mmu_lock);
				3814	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
				3815
				3816	mmu_sync_children(vcpu, sp);
				3817
				3818	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3819	spin_unlock(&vcpu->kvm->mmu_lock);
				3820	return;
				3821	}
				3822
				3823	spin_lock(&vcpu->kvm->mmu_lock);
				3824	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
				3825
				3826	for (i = 0; i < 4; ++i) {
				3827	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3828
				3829	if (root && VALID_PAGE(root)) {
				3830	root &= PT64_BASE_ADDR_MASK;
				3831	sp = page_header(root);
				3832	mmu_sync_children(vcpu, sp);
				3833	}
				3834	}
				3835
				3836	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3837	spin_unlock(&vcpu->kvm->mmu_lock);
				3838	}
				3839	EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
				3840
				3841	static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				3842	u32 access, struct x86_exception *exception)
				3843	{
				3844	if (exception)
				3845	exception->error_code = 0;
				3846	return vaddr;
				3847	}
				3848
				3849	static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
				3850	u32 access,
				3851	struct x86_exception *exception)
				3852	{
				3853	if (exception)
				3854	exception->error_code = 0;
				3855	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
				3856	}
				3857
				3858	static bool
				3859	__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
				3860	{
				3861	int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
				3862
				3863	return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) \|
				3864	((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
				3865	}
				3866
				3867	static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
				3868	{
				3869	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
				3870	}
				3871
				3872	static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
				3873	{
				3874	return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
				3875	}
				3876
				3877	static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3878	{
				3879	/*
				3880	* A nested guest cannot use the MMIO cache if it is using nested
				3881	* page tables, because cr2 is a nGPA while the cache stores GPAs.
				3882	*/
				3883	if (mmu_is_nested(vcpu))
				3884	return false;
				3885
				3886	if (direct)
				3887	return vcpu_match_mmio_gpa(vcpu, addr);
				3888
				3889	return vcpu_match_mmio_gva(vcpu, addr);
				3890	}
				3891
				3892	/* return true if reserved bit is detected on spte. */
				3893	static bool
				3894	walk_shadow_page_get_mmio_spte(struct kvm_vcpu vcpu, u64 addr, u64 sptep)
				3895	{
				3896	struct kvm_shadow_walk_iterator iterator;
				3897	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
				3898	int root, leaf;
				3899	bool reserved = false;
				3900
				3901	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3902	goto exit;
				3903
				3904	walk_shadow_page_lockless_begin(vcpu);
				3905
				3906	for (shadow_walk_init(&iterator, vcpu, addr),
				3907	leaf = root = iterator.level;
				3908	shadow_walk_okay(&iterator);
				3909	__shadow_walk_next(&iterator, spte)) {
				3910	spte = mmu_spte_get_lockless(iterator.sptep);
				3911
				3912	sptes[leaf - 1] = spte;
				3913	leaf--;
				3914
				3915	if (!is_shadow_present_pte(spte))
				3916	break;
				3917
				3918	reserved \|= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
				3919	iterator.level);
				3920	}
				3921
				3922	walk_shadow_page_lockless_end(vcpu);
				3923
				3924	if (reserved) {
				3925	pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
				3926	__func__, addr);
				3927	while (root > leaf) {
				3928	pr_err("------ spte 0x%llx level %d.\n",
				3929	sptes[root - 1], root);
				3930	root--;
				3931	}
				3932	}
				3933	exit:
				3934	*sptep = spte;
				3935	return reserved;
				3936	}
				3937
				3938	static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3939	{
				3940	u64 spte;
				3941	bool reserved;
				3942
				3943	if (mmio_info_in_cache(vcpu, addr, direct))
				3944	return RET_PF_EMULATE;
				3945
				3946	reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
				3947	if (WARN_ON(reserved))
				3948	return -EINVAL;
				3949
				3950	if (is_mmio_spte(spte)) {
				3951	gfn_t gfn = get_mmio_spte_gfn(spte);
				3952	unsigned access = get_mmio_spte_access(spte);
				3953
				3954	if (!check_mmio_spte(vcpu, spte))
				3955	return RET_PF_INVALID;
				3956
				3957	if (direct)
				3958	addr = 0;
				3959
				3960	trace_handle_mmio_page_fault(addr, gfn, access);
				3961	vcpu_cache_mmio_info(vcpu, addr, gfn, access);
				3962	return RET_PF_EMULATE;
				3963	}
				3964
				3965	/*
				3966	* If the page table is zapped by other cpus, let CPU fault again on
				3967	* the address.
				3968	*/
				3969	return RET_PF_RETRY;
				3970	}
				3971
				3972	static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
				3973	u32 error_code, gfn_t gfn)
				3974	{
				3975	if (unlikely(error_code & PFERR_RSVD_MASK))
				3976	return false;
				3977
				3978	if (!(error_code & PFERR_PRESENT_MASK) \|\|
				3979	!(error_code & PFERR_WRITE_MASK))
				3980	return false;
				3981
				3982	/*
				3983	* guest is writing the page which is write tracked which can
				3984	* not be fixed by page fault handler.
				3985	*/
				3986	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				3987	return true;
				3988
				3989	return false;
				3990	}
				3991
				3992	static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
				3993	{
				3994	struct kvm_shadow_walk_iterator iterator;
				3995	u64 spte;
				3996
				3997	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3998	return;
				3999
				4000	walk_shadow_page_lockless_begin(vcpu);
				4001	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
				4002	clear_sp_write_flooding_count(iterator.sptep);
				4003	if (!is_shadow_present_pte(spte))
				4004	break;
				4005	}
				4006	walk_shadow_page_lockless_end(vcpu);
				4007	}
				4008
				4009	static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				4010	u32 error_code, bool prefault)
				4011	{
				4012	gfn_t gfn = gva >> PAGE_SHIFT;
				4013	int r;
				4014
				4015	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
				4016
				4017	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				4018	return RET_PF_EMULATE;
				4019
				4020	r = mmu_topup_memory_caches(vcpu);
				4021	if (r)
				4022	return r;
				4023
				4024	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4025
				4026
				4027	return nonpaging_map(vcpu, gva & PAGE_MASK,
				4028	error_code, gfn, prefault);
				4029	}
				4030
				4031	static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
				4032	{
				4033	struct kvm_arch_async_pf arch;
				4034
				4035	arch.token = (vcpu->arch.apf.id++ << 12) \| vcpu->vcpu_id;
				4036	arch.gfn = gfn;
				4037	arch.direct_map = vcpu->arch.mmu.direct_map;
				4038	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
				4039
				4040	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
				4041	}
				4042
				4043	bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
				4044	{
				4045	if (unlikely(!lapic_in_kernel(vcpu) \|\|
				4046	kvm_event_needs_reinjection(vcpu) \|\|
				4047	vcpu->arch.exception.pending))
				4048	return false;
				4049
				4050	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
				4051	return false;
				4052
				4053	return kvm_x86_ops->interrupt_allowed(vcpu);
				4054	}
				4055
				4056	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				4057	gva_t gva, kvm_pfn_t pfn, bool write, bool writable)
				4058	{
				4059	struct kvm_memory_slot *slot;
				4060	bool async;
				4061
				4062	/*
				4063	* Don't expose private memslots to L2.
				4064	*/
				4065	if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
				4066	*pfn = KVM_PFN_NOSLOT;
				4067	return false;
				4068	}
				4069
				4070	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				4071	async = false;
				4072	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
				4073	if (!async)
				4074	return false; /* pfn has correct page already /
				4075
				4076	if (!prefault && kvm_can_do_async_pf(vcpu)) {
				4077	trace_kvm_try_async_get_page(gva, gfn);
				4078	if (kvm_find_async_pf_gfn(vcpu, gfn)) {
				4079	trace_kvm_async_pf_doublefault(gva, gfn);
				4080	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
				4081	return true;
				4082	} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
				4083	return true;
				4084	}
				4085
				4086	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
				4087	return false;
				4088	}
				4089
				4090	int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
				4091	u64 fault_address, char *insn, int insn_len)
				4092	{
				4093	int r = 1;
				4094
				4095	vcpu->arch.l1tf_flush_l1d = true;
				4096	switch (vcpu->arch.apf.host_apf_reason) {
				4097	default:
				4098	trace_kvm_page_fault(fault_address, error_code);
				4099
				4100	if (kvm_event_needs_reinjection(vcpu))
				4101	kvm_mmu_unprotect_page_virt(vcpu, fault_address);
				4102	r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
				4103	insn_len);
				4104	break;
				4105	case KVM_PV_REASON_PAGE_NOT_PRESENT:
				4106	vcpu->arch.apf.host_apf_reason = 0;
				4107	local_irq_disable();
				4108	kvm_async_pf_task_wait(fault_address, 0);
				4109	local_irq_enable();
				4110	break;
				4111	case KVM_PV_REASON_PAGE_READY:
				4112	vcpu->arch.apf.host_apf_reason = 0;
				4113	local_irq_disable();
				4114	kvm_async_pf_task_wake(fault_address);
				4115	local_irq_enable();
				4116	break;
				4117	}
				4118	return r;
				4119	}
				4120	EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
				4121
				4122	static bool
				4123	check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
				4124	{
				4125	int page_num = KVM_PAGES_PER_HPAGE(level);
				4126
				4127	gfn &= ~(page_num - 1);
				4128
				4129	return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
				4130	}
				4131
				4132	static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
				4133	bool prefault)
				4134	{
				4135	kvm_pfn_t pfn;
				4136	int r;
				4137	int level;
				4138	bool force_pt_level;
				4139	gfn_t gfn = gpa >> PAGE_SHIFT;
				4140	unsigned long mmu_seq;
				4141	int write = error_code & PFERR_WRITE_MASK;
				4142	bool map_writable;
				4143	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				4144	is_nx_huge_page_enabled();
				4145
				4146	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4147
				4148	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				4149	return RET_PF_EMULATE;
				4150
				4151	r = mmu_topup_memory_caches(vcpu);
				4152	if (r)
				4153	return r;
				4154
				4155	force_pt_level =
				4156	lpage_disallowed \|\|
				4157	!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
				4158	level = mapping_level(vcpu, gfn, &force_pt_level);
				4159	if (likely(!force_pt_level)) {
				4160	if (level > PT_DIRECTORY_LEVEL &&
				4161	!check_hugepage_cache_consistency(vcpu, gfn, level))
				4162	level = PT_DIRECTORY_LEVEL;
				4163	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				4164	}
				4165
				4166	if (fast_page_fault(vcpu, gpa, level, error_code))
				4167	return RET_PF_RETRY;
				4168
				4169	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				4170	smp_rmb();
				4171
				4172	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
				4173	return RET_PF_RETRY;
				4174
				4175	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
				4176	return r;
				4177
				4178	r = RET_PF_RETRY;
				4179	spin_lock(&vcpu->kvm->mmu_lock);
				4180	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				4181	goto out_unlock;
				4182	if (make_mmu_pages_available(vcpu) < 0)
				4183	goto out_unlock;
				4184	if (likely(!force_pt_level))
				4185	transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
				4186	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
				4187	prefault, lpage_disallowed);
				4188	out_unlock:
				4189	spin_unlock(&vcpu->kvm->mmu_lock);
				4190	kvm_release_pfn_clean(pfn);
				4191	return r;
				4192	}
				4193
				4194	static void nonpaging_init_context(struct kvm_vcpu *vcpu,
				4195	struct kvm_mmu *context)
				4196	{
				4197	context->page_fault = nonpaging_page_fault;
				4198	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4199	context->sync_page = nonpaging_sync_page;
				4200	context->invlpg = nonpaging_invlpg;
				4201	context->update_pte = nonpaging_update_pte;
				4202	context->root_level = 0;
				4203	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4204	context->direct_map = true;
				4205	context->nx = false;
				4206	}
				4207
				4208	/*
				4209	* Find out if a previously cached root matching the new CR3/role is available.
				4210	* The current root is also inserted into the cache.
				4211	* If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
				4212	* returned.
				4213	* Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
				4214	* false is returned. This root should now be freed by the caller.
				4215	*/
				4216	static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4217	union kvm_mmu_page_role new_role)
				4218	{
				4219	uint i;
				4220	struct kvm_mmu_root_info root;
				4221	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				4222
				4223	root.cr3 = mmu->get_cr3(vcpu);
				4224	root.hpa = mmu->root_hpa;
				4225
				4226	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
				4227	swap(root, mmu->prev_roots[i]);
				4228
				4229	if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
				4230	page_header(root.hpa) != NULL &&
				4231	new_role.word == page_header(root.hpa)->role.word)
				4232	break;
				4233	}
				4234
				4235	mmu->root_hpa = root.hpa;
				4236
				4237	return i < KVM_MMU_NUM_PREV_ROOTS;
				4238	}
				4239
				4240	static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4241	union kvm_mmu_page_role new_role,
				4242	bool skip_tlb_flush)
				4243	{
				4244	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				4245
				4246	/*
				4247	* For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
				4248	* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
				4249	* later if necessary.
				4250	*/
				4251	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
				4252	mmu->root_level >= PT64_ROOT_4LEVEL) {
				4253	if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
				4254	return false;
				4255
				4256	if (cached_root_available(vcpu, new_cr3, new_role)) {
				4257	/*
				4258	* It is possible that the cached previous root page is
				4259	* obsolete because of a change in the MMU
				4260	* generation number. However, that is accompanied by
				4261	* KVM_REQ_MMU_RELOAD, which will free the root that we
				4262	* have set here and allocate a new one.
				4263	*/
				4264
				4265	kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
				4266	if (!skip_tlb_flush) {
				4267	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				4268	kvm_x86_ops->tlb_flush(vcpu, true);
				4269	}
				4270
				4271	/*
				4272	* The last MMIO access's GVA and GPA are cached in the
				4273	* VCPU. When switching to a new CR3, that GVA->GPA
				4274	* mapping may no longer be valid. So clear any cached
				4275	* MMIO info even when we don't need to sync the shadow
				4276	* page tables.
				4277	*/
				4278	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
				4279
				4280	__clear_sp_write_flooding_count(
				4281	page_header(mmu->root_hpa));
				4282
				4283	return true;
				4284	}
				4285	}
				4286
				4287	return false;
				4288	}
				4289
				4290	static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4291	union kvm_mmu_page_role new_role,
				4292	bool skip_tlb_flush)
				4293	{
				4294	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
				4295	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
				4296	}
				4297
				4298	void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
				4299	{
				4300	__kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
				4301	skip_tlb_flush);
				4302	}
				4303	EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
				4304
				/* Return the CR3 value that kvm_read_cr3() reports for @vcpu. */
				static unsigned long get_cr3(struct kvm_vcpu *vcpu)
				{
					unsigned long cr3 = kvm_read_cr3(vcpu);

					return cr3;
				}
				4309
				4310	static void inject_page_fault(struct kvm_vcpu *vcpu,
				4311	struct x86_exception *fault)
				4312	{
				4313	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
				4314	}
				4315
				4316	static bool sync_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				4317	unsigned access, int *nr_present)
				4318	{
				4319	if (unlikely(is_mmio_spte(*sptep))) {
				4320	if (gfn != get_mmio_spte_gfn(*sptep)) {
				4321	mmu_spte_clear_no_track(sptep);
				4322	return true;
				4323	}
				4324
				4325	(*nr_present)++;
				4326	mark_mmio_spte(vcpu, sptep, gfn, access);
				4327	return true;
				4328	}
				4329
				4330	return false;
				4331	}
				4332
				4333	static inline bool is_last_gpte(struct kvm_mmu *mmu,
				4334	unsigned level, unsigned gpte)
				4335	{
				4336	/*
				4337	* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
				4338	* If it is clear, there are no large pages at this level, so clear
				4339	* PT_PAGE_SIZE_MASK in gpte if that is the case.
				4340	*/
				4341	gpte &= level - mmu->last_nonleaf_level;
				4342
				4343	/*
				4344	* PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
				4345	* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
				4346	* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
				4347	*/
				4348	gpte \|= level - PT_PAGE_TABLE_LEVEL - 1;
				4349
				4350	return gpte & PT_PAGE_SIZE_MASK;
				4351	}
				4352
				4353	#define PTTYPE_EPT 18 /* arbitrary */
				4354	#define PTTYPE PTTYPE_EPT
				4355	#include "paging_tmpl.h"
				4356	#undef PTTYPE
				4357
				4358	#define PTTYPE 64
				4359	#include "paging_tmpl.h"
				4360	#undef PTTYPE
				4361
				4362	#define PTTYPE 32
				4363	#include "paging_tmpl.h"
				4364	#undef PTTYPE
				4365
				4366	static void
				4367	__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4368	struct rsvd_bits_validate *rsvd_check,
				4369	int maxphyaddr, int level, bool nx, bool gbpages,
				4370	bool pse, bool amd)
				4371	{
				4372	u64 exb_bit_rsvd = 0;
				4373	u64 gbpages_bit_rsvd = 0;
				4374	u64 nonleaf_bit8_rsvd = 0;
				4375
				4376	rsvd_check->bad_mt_xwr = 0;
				4377
				4378	if (!nx)
				4379	exb_bit_rsvd = rsvd_bits(63, 63);
				4380	if (!gbpages)
				4381	gbpages_bit_rsvd = rsvd_bits(7, 7);
				4382
				4383	/*
				4384	* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
				4385	* leaf entries) on AMD CPUs only.
				4386	*/
				4387	if (amd)
				4388	nonleaf_bit8_rsvd = rsvd_bits(8, 8);
				4389
				4390	switch (level) {
				4391	case PT32_ROOT_LEVEL:
				4392	/* no rsvd bits for 2 level 4K page table entries */
				4393	rsvd_check->rsvd_bits_mask[0][1] = 0;
				4394	rsvd_check->rsvd_bits_mask[0][0] = 0;
				4395	rsvd_check->rsvd_bits_mask[1][0] =
				4396	rsvd_check->rsvd_bits_mask[0][0];
				4397
				4398	if (!pse) {
				4399	rsvd_check->rsvd_bits_mask[1][1] = 0;
				4400	break;
				4401	}
				4402
				4403	if (is_cpuid_PSE36())
				4404	/* 36bits PSE 4MB page */
				4405	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
				4406	else
				4407	/* 32 bits PSE 4MB page */
				4408	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
				4409	break;
				4410	case PT32E_ROOT_LEVEL:
				4411	rsvd_check->rsvd_bits_mask[0][2] =
				4412	rsvd_bits(maxphyaddr, 63) \|
				4413	rsvd_bits(5, 8) \| rsvd_bits(1, 2); /* PDPTE */
				4414	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4415	rsvd_bits(maxphyaddr, 62); /* PDE */
				4416	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4417	rsvd_bits(maxphyaddr, 62); /* PTE */
				4418	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4419	rsvd_bits(maxphyaddr, 62) \|
				4420	rsvd_bits(13, 20); /* large page */
				4421	rsvd_check->rsvd_bits_mask[1][0] =
				4422	rsvd_check->rsvd_bits_mask[0][0];
				4423	break;
				4424	case PT64_ROOT_5LEVEL:
				4425	rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd \|
				4426	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4427	rsvd_bits(maxphyaddr, 51);
				4428	rsvd_check->rsvd_bits_mask[1][4] =
				4429	rsvd_check->rsvd_bits_mask[0][4];
				4430	case PT64_ROOT_4LEVEL:
				4431	rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd \|
				4432	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4433	rsvd_bits(maxphyaddr, 51);
				4434	rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd \|
				4435	nonleaf_bit8_rsvd \| gbpages_bit_rsvd \|
				4436	rsvd_bits(maxphyaddr, 51);
				4437	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4438	rsvd_bits(maxphyaddr, 51);
				4439	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4440	rsvd_bits(maxphyaddr, 51);
				4441	rsvd_check->rsvd_bits_mask[1][3] =
				4442	rsvd_check->rsvd_bits_mask[0][3];
				4443	rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd \|
				4444	gbpages_bit_rsvd \| rsvd_bits(maxphyaddr, 51) \|
				4445	rsvd_bits(13, 29);
				4446	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4447	rsvd_bits(maxphyaddr, 51) \|
				4448	rsvd_bits(13, 20); /* large page */
				4449	rsvd_check->rsvd_bits_mask[1][0] =
				4450	rsvd_check->rsvd_bits_mask[0][0];
				4451	break;
				4452	}
				4453	}
				4454
				4455	static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4456	struct kvm_mmu *context)
				4457	{
				4458	__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
				4459	cpuid_maxphyaddr(vcpu), context->root_level,
				4460	context->nx,
				4461	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4462	is_pse(vcpu), guest_cpuid_is_amd(vcpu));
				4463	}
				4464
				4465	static void
				4466	__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
				4467	int maxphyaddr, bool execonly)
				4468	{
				4469	u64 bad_mt_xwr;
				4470
				4471	rsvd_check->rsvd_bits_mask[0][4] =
				4472	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4473	rsvd_check->rsvd_bits_mask[0][3] =
				4474	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4475	rsvd_check->rsvd_bits_mask[0][2] =
				4476	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4477	rsvd_check->rsvd_bits_mask[0][1] =
				4478	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4479	rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
				4480
				4481	/* large page */
				4482	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
				4483	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
				4484	rsvd_check->rsvd_bits_mask[1][2] =
				4485	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 29);
				4486	rsvd_check->rsvd_bits_mask[1][1] =
				4487	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 20);
				4488	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
				4489
				4490	bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
				4491	bad_mt_xwr \|= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
				4492	bad_mt_xwr \|= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
				4493	bad_mt_xwr \|= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
				4494	bad_mt_xwr \|= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
				4495	if (!execonly) {
				4496	/* bits 0..2 must not be 100 unless VMX capabilities allow it */
				4497	bad_mt_xwr \|= REPEAT_BYTE(1ull << 4);
				4498	}
				4499	rsvd_check->bad_mt_xwr = bad_mt_xwr;
				4500	}
				4501
				4502	static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
				4503	struct kvm_mmu *context, bool execonly)
				4504	{
				4505	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
				4506	cpuid_maxphyaddr(vcpu), execonly);
				4507	}
				4508
				4509	/*
				4510	* the page table on host is the shadow page table for the page
				4511	* table in guest or amd nested guest, its mmu features completely
				4512	* follow the features in guest.
				4513	*/
				4514	void
				4515	reset_shadow_zero_bits_mask(struct kvm_vcpu vcpu, struct kvm_mmu context)
				4516	{
				4517	bool uses_nx = context->nx \|\| context->base_role.smep_andnot_wp;
				4518	struct rsvd_bits_validate *shadow_zero_check;
				4519	int i;
				4520
				4521	/*
				4522	* Passing "true" to the last argument is okay; it adds a check
				4523	* on bit 8 of the SPTEs which KVM doesn't use anyway.
				4524	*/
				4525	shadow_zero_check = &context->shadow_zero_check;
				4526	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4527	boot_cpu_data.x86_phys_bits,
				4528	context->shadow_root_level, uses_nx,
				4529	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4530	is_pse(vcpu), true);
				4531
				4532	if (!shadow_me_mask)
				4533	return;
				4534
				4535	for (i = context->shadow_root_level; --i >= 0;) {
				4536	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4537	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4538	}
				4539
				4540	}
				4541	EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
				4542
				4543	static inline bool boot_cpu_is_amd(void)
				4544	{
				4545	WARN_ON_ONCE(!tdp_enabled);
				4546	return shadow_x_mask == 0;
				4547	}
				4548
				4549	/*
				4550	* the direct page table on host, use as much mmu features as
				4551	* possible, however, kvm currently does not do execution-protection.
				4552	*/
				4553	static void
				4554	reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4555	struct kvm_mmu *context)
				4556	{
				4557	struct rsvd_bits_validate *shadow_zero_check;
				4558	int i;
				4559
				4560	shadow_zero_check = &context->shadow_zero_check;
				4561
				4562	if (boot_cpu_is_amd())
				4563	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4564	boot_cpu_data.x86_phys_bits,
				4565	context->shadow_root_level, false,
				4566	boot_cpu_has(X86_FEATURE_GBPAGES),
				4567	true, true);
				4568	else
				4569	__reset_rsvds_bits_mask_ept(shadow_zero_check,
				4570	boot_cpu_data.x86_phys_bits,
				4571	false);
				4572
				4573	if (!shadow_me_mask)
				4574	return;
				4575
				4576	for (i = context->shadow_root_level; --i >= 0;) {
				4577	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4578	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4579	}
				4580	}
				4581
				4582	/*
				4583	* as the comments in reset_shadow_zero_bits_mask() except it
				4584	* is the shadow page table for intel nested guest.
				4585	*/
				4586	static void
				4587	reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4588	struct kvm_mmu *context, bool execonly)
				4589	{
				4590	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
				4591	boot_cpu_data.x86_phys_bits, execonly);
				4592	}
				4593
				/*
				 * Build an 8-bit mask in which bit i (1 <= i <= 7) is set iff (i & access)
				 * is non-zero.  Bit 0 is never set.
				 */
				#define BYTE_MASK(access) \
					((1 & (access) ? 2 : 0) | \
					 (2 & (access) ? 4 : 0) | \
					 (3 & (access) ? 8 : 0) | \
					 (4 & (access) ? 16 : 0) | \
					 (5 & (access) ? 32 : 0) | \
					 (6 & (access) ? 64 : 0) | \
					 (7 & (access) ? 128 : 0))
				4602
				4603
				4604	static void update_permission_bitmask(struct kvm_vcpu *vcpu,
				4605	struct kvm_mmu *mmu, bool ept)
				4606	{
				4607	unsigned byte;
				4608
				4609	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
				4610	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
				4611	const u8 u = BYTE_MASK(ACC_USER_MASK);
				4612
				4613	bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
				4614	bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
				4615	bool cr0_wp = is_write_protection(vcpu);
				4616
				4617	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
				4618	unsigned pfec = byte << 1;
				4619
				4620	/*
				4621	* Each "*f" variable has a 1 bit for each UWX value
				4622	* that causes a fault with the given PFEC.
				4623	*/
				4624
				4625	/* Faults from writes to non-writable pages */
				4626	u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
				4627	/* Faults from user mode accesses to supervisor pages */
				4628	u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
				4629	/* Faults from fetches of non-executable pages*/
				4630	u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
				4631	/* Faults from kernel mode fetches of user pages */
				4632	u8 smepf = 0;
				4633	/* Faults from kernel mode accesses of user pages */
				4634	u8 smapf = 0;
				4635
				4636	if (!ept) {
				4637	/* Faults from kernel mode accesses to user pages */
				4638	u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
				4639
				4640	/* Not really needed: !nx will cause pte.nx to fault */
				4641	if (!mmu->nx)
				4642	ff = 0;
				4643
				4644	/* Allow supervisor writes if !cr0.wp */
				4645	if (!cr0_wp)
				4646	wf = (pfec & PFERR_USER_MASK) ? wf : 0;
				4647
				4648	/* Disallow supervisor fetches of user code if cr4.smep */
				4649	if (cr4_smep)
				4650	smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
				4651
				4652	/*
				4653	* SMAP:kernel-mode data accesses from user-mode
				4654	* mappings should fault. A fault is considered
				4655	* as a SMAP violation if all of the following
				4656	* conditions are ture:
				4657	* - X86_CR4_SMAP is set in CR4
				4658	* - A user page is accessed
				4659	* - The access is not a fetch
				4660	* - Page fault in kernel mode
				4661	* - if CPL = 3 or X86_EFLAGS_AC is clear
				4662	*
				4663	* Here, we cover the first three conditions.
				4664	* The fourth is computed dynamically in permission_fault();
				4665	* PFERR_RSVD_MASK bit will be set in PFEC if the access is
				4666	* not subject to SMAP restrictions.
				4667	*/
				4668	if (cr4_smap)
				4669	smapf = (pfec & (PFERR_RSVD_MASK\|PFERR_FETCH_MASK)) ? 0 : kf;
				4670	}
				4671
				4672	mmu->permissions[byte] = ff \| uf \| wf \| smepf \| smapf;
				4673	}
				4674	}
				4675
				4676	/*
				4677	* PKU is an additional mechanism by which the paging controls access to
				4678	* user-mode addresses based on the value in the PKRU register. Protection
				4679	* key violations are reported through a bit in the page fault error code.
				4680	* Unlike other bits of the error code, the PK bit is not known at the
				4681	* call site of e.g. gva_to_gpa; it must be computed directly in
				4682	* permission_fault based on two bits of PKRU, on some machine state (CR4,
				4683	* CR0, EFER, CPL), and on other bits of the error code and the page tables.
				4684	*
				4685	* In particular the following conditions come from the error code, the
				4686	* page tables and the machine state:
				4687	* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
				4688	* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
				4689	* - PK is always zero if U=0 in the page tables
				4690	* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
				4691	*
				4692	* The PKRU bitmask caches the result of these four conditions. The error
				4693	* code (minus the P bit) and the page table's U bit form an index into the
				4694	* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
				4695	* with the two bits of the PKRU register corresponding to the protection key.
				4696	* For the first three conditions above the bits will be 00, thus masking
				4697	* away both AD and WD. For all reads or if the last condition holds, WD
				4698	* only will be masked away.
				4699	*/
				4700	static void update_pkru_bitmask(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				4701	bool ept)
				4702	{
				4703	unsigned bit;
				4704	bool wp;
				4705
				4706	if (ept) {
				4707	mmu->pkru_mask = 0;
				4708	return;
				4709	}
				4710
				4711	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
				4712	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) \|\| !is_long_mode(vcpu)) {
				4713	mmu->pkru_mask = 0;
				4714	return;
				4715	}
				4716
				4717	wp = is_write_protection(vcpu);
				4718
				4719	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
				4720	unsigned pfec, pkey_bits;
				4721	bool check_pkey, check_write, ff, uf, wf, pte_user;
				4722
				4723	pfec = bit << 1;
				4724	ff = pfec & PFERR_FETCH_MASK;
				4725	uf = pfec & PFERR_USER_MASK;
				4726	wf = pfec & PFERR_WRITE_MASK;
				4727
				4728	/* PFEC.RSVD is replaced by ACC_USER_MASK. */
				4729	pte_user = pfec & PFERR_RSVD_MASK;
				4730
				4731	/*
				4732	* Only need to check the access which is not an
				4733	* instruction fetch and is to a user page.
				4734	*/
				4735	check_pkey = (!ff && pte_user);
				4736	/*
				4737	* write access is controlled by PKRU if it is a
				4738	* user access or CR0.WP = 1.
				4739	*/
				4740	check_write = check_pkey && wf && (uf \|\| wp);
				4741
				4742	/* PKRU.AD stops both read and write access. */
				4743	pkey_bits = !!check_pkey;
				4744	/* PKRU.WD stops write access. */
				4745	pkey_bits \|= (!!check_write) << 1;
				4746
				4747	mmu->pkru_mask \|= (pkey_bits & 3) << pfec;
				4748	}
				4749	}
				4750
				4751	static void update_last_nonleaf_level(struct kvm_vcpu vcpu, struct kvm_mmu mmu)
				4752	{
				4753	unsigned root_level = mmu->root_level;
				4754
				4755	mmu->last_nonleaf_level = root_level;
				4756	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
				4757	mmu->last_nonleaf_level++;
				4758	}
				4759
				4760	static void paging64_init_context_common(struct kvm_vcpu *vcpu,
				4761	struct kvm_mmu *context,
				4762	int level)
				4763	{
				4764	context->nx = is_nx(vcpu);
				4765	context->root_level = level;
				4766
				4767	reset_rsvds_bits_mask(vcpu, context);
				4768	update_permission_bitmask(vcpu, context, false);
				4769	update_pkru_bitmask(vcpu, context, false);
				4770	update_last_nonleaf_level(vcpu, context);
				4771
				4772	MMU_WARN_ON(!is_pae(vcpu));
				4773	context->page_fault = paging64_page_fault;
				4774	context->gva_to_gpa = paging64_gva_to_gpa;
				4775	context->sync_page = paging64_sync_page;
				4776	context->invlpg = paging64_invlpg;
				4777	context->update_pte = paging64_update_pte;
				4778	context->shadow_root_level = level;
				4779	context->direct_map = false;
				4780	}
				4781
				4782	static void paging64_init_context(struct kvm_vcpu *vcpu,
				4783	struct kvm_mmu *context)
				4784	{
				4785	int root_level = is_la57_mode(vcpu) ?
				4786	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4787
				4788	paging64_init_context_common(vcpu, context, root_level);
				4789	}
				4790
				4791	static void paging32_init_context(struct kvm_vcpu *vcpu,
				4792	struct kvm_mmu *context)
				4793	{
				4794	context->nx = false;
				4795	context->root_level = PT32_ROOT_LEVEL;
				4796
				4797	reset_rsvds_bits_mask(vcpu, context);
				4798	update_permission_bitmask(vcpu, context, false);
				4799	update_pkru_bitmask(vcpu, context, false);
				4800	update_last_nonleaf_level(vcpu, context);
				4801
				4802	context->page_fault = paging32_page_fault;
				4803	context->gva_to_gpa = paging32_gva_to_gpa;
				4804	context->sync_page = paging32_sync_page;
				4805	context->invlpg = paging32_invlpg;
				4806	context->update_pte = paging32_update_pte;
				4807	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4808	context->direct_map = false;
				4809	}
				4810
				4811	static void paging32E_init_context(struct kvm_vcpu *vcpu,
				4812	struct kvm_mmu *context)
				4813	{
				4814	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
				4815	}
				4816
				4817	static union kvm_mmu_page_role
				4818	kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
				4819	{
				4820	union kvm_mmu_page_role role = {0};
				4821
				4822	role.guest_mode = is_guest_mode(vcpu);
				4823	role.smm = is_smm(vcpu);
				4824	role.ad_disabled = (shadow_accessed_mask == 0);
				4825	role.level = kvm_x86_ops->get_tdp_level(vcpu);
				4826	role.direct = true;
				4827	role.access = ACC_ALL;
				4828
				4829	return role;
				4830	}
				4831
				4832	static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
				4833	{
				4834	struct kvm_mmu *context = &vcpu->arch.mmu;
				4835
				4836	context->base_role.word = mmu_base_role_mask.word &
				4837	kvm_calc_tdp_mmu_root_page_role(vcpu).word;
				4838	context->page_fault = tdp_page_fault;
				4839	context->sync_page = nonpaging_sync_page;
				4840	context->invlpg = nonpaging_invlpg;
				4841	context->update_pte = nonpaging_update_pte;
				4842	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
				4843	context->direct_map = true;
				4844	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
				4845	context->get_cr3 = get_cr3;
				4846	context->get_pdptr = kvm_pdptr_read;
				4847	context->inject_page_fault = kvm_inject_page_fault;
				4848
				4849	if (!is_paging(vcpu)) {
				4850	context->nx = false;
				4851	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4852	context->root_level = 0;
				4853	} else if (is_long_mode(vcpu)) {
				4854	context->nx = is_nx(vcpu);
				4855	context->root_level = is_la57_mode(vcpu) ?
				4856	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4857	reset_rsvds_bits_mask(vcpu, context);
				4858	context->gva_to_gpa = paging64_gva_to_gpa;
				4859	} else if (is_pae(vcpu)) {
				4860	context->nx = is_nx(vcpu);
				4861	context->root_level = PT32E_ROOT_LEVEL;
				4862	reset_rsvds_bits_mask(vcpu, context);
				4863	context->gva_to_gpa = paging64_gva_to_gpa;
				4864	} else {
				4865	context->nx = false;
				4866	context->root_level = PT32_ROOT_LEVEL;
				4867	reset_rsvds_bits_mask(vcpu, context);
				4868	context->gva_to_gpa = paging32_gva_to_gpa;
				4869	}
				4870
				4871	update_permission_bitmask(vcpu, context, false);
				4872	update_pkru_bitmask(vcpu, context, false);
				4873	update_last_nonleaf_level(vcpu, context);
				4874	reset_tdp_shadow_zero_bits_mask(vcpu, context);
				4875	}
				4876
				4877	static union kvm_mmu_page_role
				4878	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
				4879	{
				4880	union kvm_mmu_page_role role = {0};
				4881	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
				4882	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
				4883
				4884	role.nxe = is_nx(vcpu);
				4885	role.cr4_pae = !!is_pae(vcpu);
				4886	role.cr0_wp = is_write_protection(vcpu);
				4887	role.smep_andnot_wp = smep && !is_write_protection(vcpu);
				4888	role.smap_andnot_wp = smap && !is_write_protection(vcpu);
				4889	role.guest_mode = is_guest_mode(vcpu);
				4890	role.smm = is_smm(vcpu);
				4891	role.direct = !is_paging(vcpu);
				4892	role.access = ACC_ALL;
				4893
				4894	if (!is_long_mode(vcpu))
				4895	role.level = PT32E_ROOT_LEVEL;
				4896	else if (is_la57_mode(vcpu))
				4897	role.level = PT64_ROOT_5LEVEL;
				4898	else
				4899	role.level = PT64_ROOT_4LEVEL;
				4900
				4901	return role;
				4902	}
				4903
				4904	void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
				4905	{
				4906	struct kvm_mmu *context = &vcpu->arch.mmu;
				4907
				4908	if (!is_paging(vcpu))
				4909	nonpaging_init_context(vcpu, context);
				4910	else if (is_long_mode(vcpu))
				4911	paging64_init_context(vcpu, context);
				4912	else if (is_pae(vcpu))
				4913	paging32E_init_context(vcpu, context);
				4914	else
				4915	paging32_init_context(vcpu, context);
				4916
				4917	context->base_role.word = mmu_base_role_mask.word &
				4918	kvm_calc_shadow_mmu_root_page_role(vcpu).word;
				4919	reset_shadow_zero_bits_mask(vcpu, context);
				4920	}
				4921	EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
				4922
				4923	static union kvm_mmu_page_role
				4924	kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
				4925	{
				4926	union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
				4927
				4928	role.level = PT64_ROOT_4LEVEL;
				4929	role.direct = false;
				4930	role.ad_disabled = !accessed_dirty;
				4931	role.guest_mode = true;
				4932	role.access = ACC_ALL;
				4933
				4934	return role;
				4935	}
				4936
				4937	void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
				4938	bool accessed_dirty, gpa_t new_eptp)
				4939	{
				4940	struct kvm_mmu *context = &vcpu->arch.mmu;
				4941	union kvm_mmu_page_role root_page_role =
				4942	kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
				4943
				4944	__kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
				4945	context->shadow_root_level = PT64_ROOT_4LEVEL;
				4946
				4947	context->nx = true;
				4948	context->ept_ad = accessed_dirty;
				4949	context->page_fault = ept_page_fault;
				4950	context->gva_to_gpa = ept_gva_to_gpa;
				4951	context->sync_page = ept_sync_page;
				4952	context->invlpg = ept_invlpg;
				4953	context->update_pte = ept_update_pte;
				4954	context->root_level = PT64_ROOT_4LEVEL;
				4955	context->direct_map = false;
				4956	context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
				4957	update_permission_bitmask(vcpu, context, true);
				4958	update_pkru_bitmask(vcpu, context, true);
				4959	update_last_nonleaf_level(vcpu, context);
				4960	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
				4961	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
				4962	}
				4963	EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
				4964
				4965	static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
				4966	{
				4967	struct kvm_mmu *context = &vcpu->arch.mmu;
				4968
				4969	kvm_init_shadow_mmu(vcpu);
				4970	context->set_cr3 = kvm_x86_ops->set_cr3;
				4971	context->get_cr3 = get_cr3;
				4972	context->get_pdptr = kvm_pdptr_read;
				4973	context->inject_page_fault = kvm_inject_page_fault;
				4974	}
				4975
				4976	static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
				4977	{
				4978	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
				4979
				4980	g_context->get_cr3 = get_cr3;
				4981	g_context->get_pdptr = kvm_pdptr_read;
				4982	g_context->inject_page_fault = kvm_inject_page_fault;
				4983
				4984	/*
				4985	* Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
				4986	* L1's nested page tables (e.g. EPT12). The nested translation
				4987	* of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
				4988	* L2's page tables as the first level of translation and L1's
				4989	* nested page tables as the second level of translation. Basically
				4990	* the gva_to_gpa functions between mmu and nested_mmu are swapped.
				4991	*/
				4992	if (!is_paging(vcpu)) {
				4993	g_context->nx = false;
				4994	g_context->root_level = 0;
				4995	g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
				4996	} else if (is_long_mode(vcpu)) {
				4997	g_context->nx = is_nx(vcpu);
				4998	g_context->root_level = is_la57_mode(vcpu) ?
				4999	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				5000	reset_rsvds_bits_mask(vcpu, g_context);
				5001	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				5002	} else if (is_pae(vcpu)) {
				5003	g_context->nx = is_nx(vcpu);
				5004	g_context->root_level = PT32E_ROOT_LEVEL;
				5005	reset_rsvds_bits_mask(vcpu, g_context);
				5006	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				5007	} else {
				5008	g_context->nx = false;
				5009	g_context->root_level = PT32_ROOT_LEVEL;
				5010	reset_rsvds_bits_mask(vcpu, g_context);
				5011	g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
				5012	}
				5013
				5014	update_permission_bitmask(vcpu, g_context, false);
				5015	update_pkru_bitmask(vcpu, g_context, false);
				5016	update_last_nonleaf_level(vcpu, g_context);
				5017	}
				5018
				5019	void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
				5020	{
				5021	if (reset_roots) {
				5022	uint i;
				5023
				5024	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				5025
				5026	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				5027	vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
				5028	}
				5029
				5030	if (mmu_is_nested(vcpu))
				5031	init_kvm_nested_mmu(vcpu);
				5032	else if (tdp_enabled)
				5033	init_kvm_tdp_mmu(vcpu);
				5034	else
				5035	init_kvm_softmmu(vcpu);
				5036	}
				5037	EXPORT_SYMBOL_GPL(kvm_init_mmu);
				5038
				5039	static union kvm_mmu_page_role
				5040	kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
				5041	{
				5042	if (tdp_enabled)
				5043	return kvm_calc_tdp_mmu_root_page_role(vcpu);
				5044	else
				5045	return kvm_calc_shadow_mmu_root_page_role(vcpu);
				5046	}
				5047
				5048	void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
				5049	{
				5050	kvm_mmu_unload(vcpu);
				5051	kvm_init_mmu(vcpu, true);
				5052	}
				5053	EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
				5054
				5055	int kvm_mmu_load(struct kvm_vcpu *vcpu)
				5056	{
				5057	int r;
				5058
				5059	r = mmu_topup_memory_caches(vcpu);
				5060	if (r)
				5061	goto out;
				5062	r = mmu_alloc_roots(vcpu);
				5063	kvm_mmu_sync_roots(vcpu);
				5064	if (r)
				5065	goto out;
				5066	kvm_mmu_load_cr3(vcpu);
				5067	kvm_x86_ops->tlb_flush(vcpu, true);
				5068	out:
				5069	return r;
				5070	}
				5071	EXPORT_SYMBOL_GPL(kvm_mmu_load);
				5072
				5073	void kvm_mmu_unload(struct kvm_vcpu *vcpu)
				5074	{
				5075	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
				5076	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				5077	}
				5078	EXPORT_SYMBOL_GPL(kvm_mmu_unload);
				5079
				5080	static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				5081	struct kvm_mmu_page sp, u64 spte,
				5082	const void *new)
				5083	{
				5084	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
				5085	++vcpu->kvm->stat.mmu_pde_zapped;
				5086	return;
				5087	}
				5088
				5089	++vcpu->kvm->stat.mmu_pte_updated;
				5090	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
				5091	}
				5092
				5093	static bool need_remote_flush(u64 old, u64 new)
				5094	{
				5095	if (!is_shadow_present_pte(old))
				5096	return false;
				5097	if (!is_shadow_present_pte(new))
				5098	return true;
				5099	if ((old ^ new) & PT64_BASE_ADDR_MASK)
				5100	return true;
				5101	old ^= shadow_nx_mask;
				5102	new ^= shadow_nx_mask;
				5103	return (old & ~new & PT64_PERM_MASK) != 0;
				5104	}
				5105
				5106	static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu vcpu, gpa_t gpa,
				5107	int *bytes)
				5108	{
				5109	u64 gentry = 0;
				5110	int r;
				5111
				5112	/*
				5113	* Assume that the pte write on a page table of the same type
				5114	* as the current vcpu paging mode since we update the sptes only
				5115	* when they have the same mode.
				5116	*/
				5117	if (is_pae(vcpu) && *bytes == 4) {
				5118	/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
				5119	*gpa &= ~(gpa_t)7;
				5120	*bytes = 8;
				5121	}
				5122
				5123	if (bytes == 4 \|\| bytes == 8) {
				5124	r = kvm_vcpu_read_guest_atomic(vcpu, gpa, &gentry, bytes);
				5125	if (r)
				5126	gentry = 0;
				5127	}
				5128
				5129	return gentry;
				5130	}
				5131
				5132	/*
				5133	* If we're seeing too many writes to a page, it may no longer be a page table,
				5134	* or we may be forking, in which case it is better to unmap the page.
				5135	*/
				5136	static bool detect_write_flooding(struct kvm_mmu_page *sp)
				5137	{
				5138	/*
				5139	* Skip write-flooding detected for the sp whose level is 1, because
				5140	* it can become unsync, then the guest page is not write-protected.
				5141	*/
				5142	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
				5143	return false;
				5144
				5145	atomic_inc(&sp->write_flooding_count);
				5146	return atomic_read(&sp->write_flooding_count) >= 3;
				5147	}
				5148
				5149	/*
				5150	* Misaligned accesses are too much trouble to fix up; also, they usually
				5151	* indicate a page is not used as a page table.
				5152	*/
				5153	static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				5154	int bytes)
				5155	{
				5156	unsigned offset, pte_size, misaligned;
				5157
				5158	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				5159	gpa, bytes, sp->role.word);
				5160
				5161	offset = offset_in_page(gpa);
				5162	pte_size = sp->role.cr4_pae ? 8 : 4;
				5163
				5164	/*
				5165	* Sometimes, the OS only writes the last one bytes to update status
				5166	* bits, for example, in linux, andb instruction is used in clear_bit().
				5167	*/
				5168	if (!(offset & (pte_size - 1)) && bytes == 1)
				5169	return false;
				5170
				5171	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
				5172	misaligned \|= bytes < 4;
				5173
				5174	return misaligned;
				5175	}
				5176
				5177	static u64 get_written_sptes(struct kvm_mmu_page sp, gpa_t gpa, int *nspte)
				5178	{
				5179	unsigned page_offset, quadrant;
				5180	u64 *spte;
				5181	int level;
				5182
				5183	page_offset = offset_in_page(gpa);
				5184	level = sp->role.level;
				5185	*nspte = 1;
				5186	if (!sp->role.cr4_pae) {
				5187	page_offset <<= 1; /* 32->64 */
				5188	/*
				5189	* A 32-bit pde maps 4MB while the shadow pdes map
				5190	* only 2MB. So we need to double the offset again
				5191	* and zap two pdes instead of one.
				5192	*/
				5193	if (level == PT32_ROOT_LEVEL) {
				5194	page_offset &= ~7; /* kill rounding error */
				5195	page_offset <<= 1;
				5196	*nspte = 2;
				5197	}
				5198	quadrant = page_offset >> PAGE_SHIFT;
				5199	page_offset &= ~PAGE_MASK;
				5200	if (quadrant != sp->role.quadrant)
				5201	return NULL;
				5202	}
				5203
				5204	spte = &sp->spt[page_offset / sizeof(*spte)];
				5205	return spte;
				5206	}
				5207
				5208	static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
				5209	const u8 *new, int bytes,
				5210	struct kvm_page_track_notifier_node *node)
				5211	{
				5212	gfn_t gfn = gpa >> PAGE_SHIFT;
				5213	struct kvm_mmu_page *sp;
				5214	LIST_HEAD(invalid_list);
				5215	u64 entry, gentry, *spte;
				5216	int npte;
				5217	bool remote_flush, local_flush;
				5218
				5219	/*
				5220	* If we don't have indirect shadow pages, it means no page is
				5221	* write-protected, so we can exit simply.
				5222	*/
				5223	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
				5224	return;
				5225
				5226	remote_flush = local_flush = false;
				5227
				5228	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
				5229
				5230	/*
				5231	* No need to care whether allocation memory is successful
				5232	* or not since pte prefetch is skiped if it does not have
				5233	* enough objects in the cache.
				5234	*/
				5235	mmu_topup_memory_caches(vcpu);
				5236
				5237	spin_lock(&vcpu->kvm->mmu_lock);
				5238
				5239	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
				5240
				5241	++vcpu->kvm->stat.mmu_pte_write;
				5242	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
				5243
				5244	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				5245	if (detect_write_misaligned(sp, gpa, bytes) \|\|
				5246	detect_write_flooding(sp)) {
				5247	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
				5248	++vcpu->kvm->stat.mmu_flooded;
				5249	continue;
				5250	}
				5251
				5252	spte = get_written_sptes(sp, gpa, &npte);
				5253	if (!spte)
				5254	continue;
				5255
				5256	local_flush = true;
				5257	while (npte--) {
				5258	entry = *spte;
				5259	mmu_page_zap_pte(vcpu->kvm, sp, spte);
				5260	if (gentry &&
				5261	!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
				5262	& mmu_base_role_mask.word) && rmap_can_add(vcpu))
				5263	mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
				5264	if (need_remote_flush(entry, *spte))
				5265	remote_flush = true;
				5266	++spte;
				5267	}
				5268	}
				5269	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
				5270	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
				5271	spin_unlock(&vcpu->kvm->mmu_lock);
				5272	}
				5273
				5274	int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
				5275	{
				5276	gpa_t gpa;
				5277	int r;
				5278
				5279	if (vcpu->arch.mmu.direct_map)
				5280	return 0;
				5281
				5282	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
				5283
				5284	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
				5285
				5286	return r;
				5287	}
				5288	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
				5289
				5290	static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
				5291	{
				5292	LIST_HEAD(invalid_list);
				5293
				5294	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
				5295	return 0;
				5296
				5297	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
				5298	if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
				5299	break;
				5300
				5301	++vcpu->kvm->stat.mmu_recycled;
				5302	}
				5303	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				5304
				5305	if (!kvm_mmu_available_pages(vcpu->kvm))
				5306	return -ENOSPC;
				5307	return 0;
				5308	}
				5309
				5310	int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
				5311	void *insn, int insn_len)
				5312	{
				5313	int r, emulation_type = 0;
				5314	enum emulation_result er;
				5315	bool direct = vcpu->arch.mmu.direct_map;
				5316
				5317	/* With shadow page tables, fault_address contains a GVA or nGPA. */
				5318	if (vcpu->arch.mmu.direct_map) {
				5319	vcpu->arch.gpa_available = true;
				5320	vcpu->arch.gpa_val = cr2;
				5321	}
				5322
				5323	r = RET_PF_INVALID;
				5324	if (unlikely(error_code & PFERR_RSVD_MASK)) {
				5325	r = handle_mmio_page_fault(vcpu, cr2, direct);
				5326	if (r == RET_PF_EMULATE)
				5327	goto emulate;
				5328	}
				5329
				5330	if (r == RET_PF_INVALID) {
				5331	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
				5332	false);
				5333	WARN_ON(r == RET_PF_INVALID);
				5334	}
				5335
				5336	if (r == RET_PF_RETRY)
				5337	return 1;
				5338	if (r < 0)
				5339	return r;
				5340
				5341	/*
				5342	* Before emulating the instruction, check if the error code
				5343	* was due to a RO violation while translating the guest page.
				5344	* This can occur when using nested virtualization with nested
				5345	* paging in both guests. If true, we simply unprotect the page
				5346	* and resume the guest.
				5347	*/
				5348	if (vcpu->arch.mmu.direct_map &&
				5349	(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
				5350	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
				5351	return 1;
				5352	}
				5353
				5354	/*
				5355	* vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
				5356	* optimistically try to just unprotect the page and let the processor
				5357	* re-execute the instruction that caused the page fault. Do not allow
				5358	* retrying MMIO emulation, as it's not only pointless but could also
				5359	* cause us to enter an infinite loop because the processor will keep
				5360	* faulting on the non-existent MMIO address. Retrying an instruction
				5361	* from a nested guest is also pointless and dangerous as we are only
				5362	* explicitly shadowing L1's page tables, i.e. unprotecting something
				5363	* for L1 isn't going to magically fix whatever issue cause L2 to fail.
				5364	*/
				5365	if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
				5366	emulation_type = EMULTYPE_ALLOW_RETRY;
				5367	emulate:
				5368	/*
				5369	* On AMD platforms, under certain conditions insn_len may be zero on #NPF.
				5370	* This can happen if a guest gets a page-fault on data access but the HW
				5371	* table walker is not able to read the instruction page (e.g instruction
				5372	* page is not present in memory). In those cases we simply restart the
				5373	* guest.
				5374	*/
				5375	if (unlikely(insn && !insn_len))
				5376	return 1;
				5377
				5378	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
				5379
				5380	switch (er) {
				5381	case EMULATE_DONE:
				5382	return 1;
				5383	case EMULATE_USER_EXIT:
				5384	++vcpu->stat.mmio_exits;
				5385	/* fall through */
				5386	case EMULATE_FAIL:
				5387	return 0;
				5388	default:
				5389	BUG();
				5390	}
				5391	}
				5392	EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
				5393
				5394	void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
				5395	{
				5396	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				5397	int i;
				5398
				5399	/* INVLPG on a * non-canonical address is a NOP according to the SDM. */
				5400	if (is_noncanonical_address(gva, vcpu))
				5401	return;
				5402
				5403	mmu->invlpg(vcpu, gva, mmu->root_hpa);
				5404
				5405	/*
				5406	* INVLPG is required to invalidate any global mappings for the VA,
				5407	* irrespective of PCID. Since it would take us roughly similar amount
				5408	* of work to determine whether any of the prev_root mappings of the VA
				5409	* is marked global, or to just sync it blindly, so we might as well
				5410	* just always sync it.
				5411	*
				5412	* Mappings not reachable via the current cr3 or the prev_roots will be
				5413	* synced when switching to that cr3, so nothing needs to be done here
				5414	* for them.
				5415	*/
				5416	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				5417	if (VALID_PAGE(mmu->prev_roots[i].hpa))
				5418	mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
				5419
				5420	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
				5421	++vcpu->stat.invlpg;
				5422	}
				5423	EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
				5424
				5425	void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
				5426	{
				5427	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				5428	bool tlb_flush = false;
				5429	uint i;
				5430
				5431	if (pcid == kvm_get_active_pcid(vcpu)) {
				5432	mmu->invlpg(vcpu, gva, mmu->root_hpa);
				5433	tlb_flush = true;
				5434	}
				5435
				5436	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
				5437	if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
				5438	pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
				5439	mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
				5440	tlb_flush = true;
				5441	}
				5442	}
				5443
				5444	if (tlb_flush)
				5445	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
				5446
				5447	++vcpu->stat.invlpg;
				5448
				5449	/*
				5450	* Mappings not reachable via the current cr3 or the prev_roots will be
				5451	* synced when switching to that cr3, so nothing needs to be done here
				5452	* for them.
				5453	*/
				5454	}
				5455	EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
				5456
				5457	void kvm_enable_tdp(void)
				5458	{
				5459	tdp_enabled = true;
				5460	}
				5461	EXPORT_SYMBOL_GPL(kvm_enable_tdp);
				5462
				5463	void kvm_disable_tdp(void)
				5464	{
				5465	tdp_enabled = false;
				5466	}
				5467	EXPORT_SYMBOL_GPL(kvm_disable_tdp);
				5468
				5469	static void free_mmu_pages(struct kvm_vcpu *vcpu)
				5470	{
				5471	free_page((unsigned long)vcpu->arch.mmu.pae_root);
				5472	free_page((unsigned long)vcpu->arch.mmu.lm_root);
				5473	}
				5474
				5475	static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
				5476	{
				5477	struct page *page;
				5478	int i;
				5479
				5480	/*
				5481	* When using PAE paging, the four PDPTEs are treated as 'root' pages,
				5482	* while the PDP table is a per-vCPU construct that's allocated at MMU
				5483	* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
				5484	* x86_64. Therefore we need to allocate the PDP table in the first
				5485	* 4GB of memory, which happens to fit the DMA32 zone. Except for
				5486	* SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
				5487	* skip allocating the PDP table.
				5488	*/
				5489	if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
				5490	return 0;
				5491
				5492	/*
				5493	* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
				5494	* Therefore we need to allocate shadow page tables in the first
				5495	* 4GB of memory, which happens to fit the DMA32 zone.
				5496	*/
				5497	page = alloc_page(GFP_KERNEL \| __GFP_DMA32);
				5498	if (!page)
				5499	return -ENOMEM;
				5500
				5501	vcpu->arch.mmu.pae_root = page_address(page);
				5502	for (i = 0; i < 4; ++i)
				5503	vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
				5504
				5505	return 0;
				5506	}
				5507
				5508	int kvm_mmu_create(struct kvm_vcpu *vcpu)
				5509	{
				5510	uint i;
				5511
				5512	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
				5513	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				5514	vcpu->arch.mmu.translate_gpa = translate_gpa;
				5515	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
				5516
				5517	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				5518	vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
				5519
				5520	return alloc_mmu_pages(vcpu);
				5521	}
				5522
				5523	void kvm_mmu_setup(struct kvm_vcpu *vcpu)
				5524	{
				5525	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				5526
				5527	/*
				5528	* kvm_mmu_setup() is called only on vCPU initialization.
				5529	* Therefore, no need to reset mmu roots as they are not yet
				5530	* initialized.
				5531	*/
				5532	kvm_init_mmu(vcpu, false);
				5533	}
				5534
				5535	static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
				5536	struct kvm_memory_slot *slot,
				5537	struct kvm_page_track_notifier_node *node)
				5538	{
				5539	kvm_mmu_invalidate_zap_all_pages(kvm);
				5540	}
				5541
				5542	void kvm_mmu_init_vm(struct kvm *kvm)
				5543	{
				5544	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5545
				5546	node->track_write = kvm_mmu_pte_write;
				5547	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
				5548	kvm_page_track_register_notifier(kvm, node);
				5549	}
				5550
				5551	void kvm_mmu_uninit_vm(struct kvm *kvm)
				5552	{
				5553	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5554
				5555	kvm_page_track_unregister_notifier(kvm, node);
				5556	}
				5557
				5558	/* The return value indicates if tlb flush on all vcpus is needed. */
				5559	typedef bool (slot_level_handler) (struct kvm kvm, struct kvm_rmap_head *rmap_head);
				5560
				5561	/* The caller should hold mmu-lock before calling this function. */
				5562	static __always_inline bool
				5563	slot_handle_level_range(struct kvm kvm, struct kvm_memory_slot memslot,
				5564	slot_level_handler fn, int start_level, int end_level,
				5565	gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
				5566	{
				5567	struct slot_rmap_walk_iterator iterator;
				5568	bool flush = false;
				5569
				5570	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
				5571	end_gfn, &iterator) {
				5572	if (iterator.rmap)
				5573	flush \|= fn(kvm, iterator.rmap);
				5574
				5575	if (need_resched() \|\| spin_needbreak(&kvm->mmu_lock)) {
				5576	if (flush && lock_flush_tlb) {
				5577	kvm_flush_remote_tlbs(kvm);
				5578	flush = false;
				5579	}
				5580	cond_resched_lock(&kvm->mmu_lock);
				5581	}
				5582	}
				5583
				5584	if (flush && lock_flush_tlb) {
				5585	kvm_flush_remote_tlbs(kvm);
				5586	flush = false;
				5587	}
				5588
				5589	return flush;
				5590	}
				5591
				5592	static __always_inline bool
				5593	slot_handle_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5594	slot_level_handler fn, int start_level, int end_level,
				5595	bool lock_flush_tlb)
				5596	{
				5597	return slot_handle_level_range(kvm, memslot, fn, start_level,
				5598	end_level, memslot->base_gfn,
				5599	memslot->base_gfn + memslot->npages - 1,
				5600	lock_flush_tlb);
				5601	}
				5602
				5603	static __always_inline bool
				5604	slot_handle_all_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5605	slot_level_handler fn, bool lock_flush_tlb)
				5606	{
				5607	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5608	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5609	}
				5610
				5611	static __always_inline bool
				5612	slot_handle_large_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5613	slot_level_handler fn, bool lock_flush_tlb)
				5614	{
				5615	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
				5616	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5617	}
				5618
				5619	static __always_inline bool
				5620	slot_handle_leaf(struct kvm kvm, struct kvm_memory_slot memslot,
				5621	slot_level_handler fn, bool lock_flush_tlb)
				5622	{
				5623	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5624	PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
				5625	}
				5626
				5627	void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
				5628	{
				5629	struct kvm_memslots *slots;
				5630	struct kvm_memory_slot *memslot;
				5631	int i;
				5632
				5633	spin_lock(&kvm->mmu_lock);
				5634	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				5635	slots = __kvm_memslots(kvm, i);
				5636	kvm_for_each_memslot(memslot, slots) {
				5637	gfn_t start, end;
				5638
				5639	start = max(gfn_start, memslot->base_gfn);
				5640	end = min(gfn_end, memslot->base_gfn + memslot->npages);
				5641	if (start >= end)
				5642	continue;
				5643
				5644	slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
				5645	PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
				5646	start, end - 1, true);
				5647	}
				5648	}
				5649
				5650	spin_unlock(&kvm->mmu_lock);
				5651	}
				5652
				5653	static bool slot_rmap_write_protect(struct kvm *kvm,
				5654	struct kvm_rmap_head *rmap_head)
				5655	{
				5656	return __rmap_write_protect(kvm, rmap_head, false);
				5657	}
				5658
				5659	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				5660	struct kvm_memory_slot *memslot)
				5661	{
				5662	bool flush;
				5663
				5664	spin_lock(&kvm->mmu_lock);
				5665	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
				5666	false);
				5667	spin_unlock(&kvm->mmu_lock);
				5668
				5669	/*
				5670	* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
				5671	* which do tlb flush out of mmu-lock should be serialized by
				5672	* kvm->slots_lock otherwise tlb flush would be missed.
				5673	*/
				5674	lockdep_assert_held(&kvm->slots_lock);
				5675
				5676	/*
				5677	* We can flush all the TLBs out of the mmu lock without TLB
				5678	* corruption since we just change the spte from writable to
				5679	* readonly so that we only need to care the case of changing
				5680	* spte from present to present (changing the spte from present
				5681	* to nonpresent will flush all the TLBs immediately), in other
				5682	* words, the only case we care is mmu_spte_update() where we
				5683	* haved checked SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE
				5684	* instead of PT_WRITABLE_MASK, that means it does not depend
				5685	* on PT_WRITABLE_MASK anymore.
				5686	*/
				5687	if (flush)
				5688	kvm_flush_remote_tlbs(kvm);
				5689	}
				5690
				5691	static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
				5692	struct kvm_rmap_head *rmap_head)
				5693	{
				5694	u64 *sptep;
				5695	struct rmap_iterator iter;
				5696	int need_tlb_flush = 0;
				5697	kvm_pfn_t pfn;
				5698	struct kvm_mmu_page *sp;
				5699
				5700	restart:
				5701	for_each_rmap_spte(rmap_head, &iter, sptep) {
				5702	sp = page_header(__pa(sptep));
				5703	pfn = spte_to_pfn(*sptep);
				5704
				5705	/*
				5706	* We cannot do huge page mapping for indirect shadow pages,
				5707	* which are found on the last rmap (level = 1) when not using
				5708	* tdp; such shadow pages are synced with the page table in
				5709	* the guest, and the guest page table is using 4K page size
				5710	* mapping if the indirect sp has level = 1.
				5711	*/
				5712	if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
				5713	!kvm_is_zone_device_pfn(pfn) &&
				5714	PageTransCompoundMap(pfn_to_page(pfn))) {
				5715	drop_spte(kvm, sptep);
				5716	need_tlb_flush = 1;
				5717	goto restart;
				5718	}
				5719	}
				5720
				5721	return need_tlb_flush;
				5722	}
				5723
				5724	void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				5725	const struct kvm_memory_slot *memslot)
				5726	{
				5727	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
				5728	spin_lock(&kvm->mmu_lock);
				5729	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
				5730	kvm_mmu_zap_collapsible_spte, true);
				5731	spin_unlock(&kvm->mmu_lock);
				5732	}
				5733
				5734	void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				5735	struct kvm_memory_slot *memslot)
				5736	{
				5737	bool flush;
				5738
				5739	spin_lock(&kvm->mmu_lock);
				5740	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
				5741	spin_unlock(&kvm->mmu_lock);
				5742
				5743	lockdep_assert_held(&kvm->slots_lock);
				5744
				5745	/*
				5746	* It's also safe to flush TLBs out of mmu lock here as currently this
				5747	* function is only used for dirty logging, in which case flushing TLB
				5748	* out of mmu lock also guarantees no dirty pages will be lost in
				5749	* dirty_bitmap.
				5750	*/
				5751	if (flush)
				5752	kvm_flush_remote_tlbs(kvm);
				5753	}
				5754	EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
				5755
				5756	void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
				5757	struct kvm_memory_slot *memslot)
				5758	{
				5759	bool flush;
				5760
				5761	spin_lock(&kvm->mmu_lock);
				5762	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
				5763	false);
				5764	spin_unlock(&kvm->mmu_lock);
				5765
				5766	/* see kvm_mmu_slot_remove_write_access */
				5767	lockdep_assert_held(&kvm->slots_lock);
				5768
				5769	if (flush)
				5770	kvm_flush_remote_tlbs(kvm);
				5771	}
				5772	EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
				5773
				5774	void kvm_mmu_slot_set_dirty(struct kvm *kvm,
				5775	struct kvm_memory_slot *memslot)
				5776	{
				5777	bool flush;
				5778
				5779	spin_lock(&kvm->mmu_lock);
				5780	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
				5781	spin_unlock(&kvm->mmu_lock);
				5782
				5783	lockdep_assert_held(&kvm->slots_lock);
				5784
				5785	/* see kvm_mmu_slot_leaf_clear_dirty */
				5786	if (flush)
				5787	kvm_flush_remote_tlbs(kvm);
				5788	}
				5789	EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
				5790
				5791	#define BATCH_ZAP_PAGES 10
				5792	static void kvm_zap_obsolete_pages(struct kvm *kvm)
				5793	{
				5794	struct kvm_mmu_page sp, node;
				5795	int batch = 0;
				5796
				5797	restart:
				5798	list_for_each_entry_safe_reverse(sp, node,
				5799	&kvm->arch.active_mmu_pages, link) {
				5800	int ret;
				5801
				5802	/*
				5803	* No obsolete page exists before new created page since
				5804	* active_mmu_pages is the FIFO list.
				5805	*/
				5806	if (!is_obsolete_sp(kvm, sp))
				5807	break;
				5808
				5809	/*
				5810	* Since we are reversely walking the list and the invalid
				5811	* list will be moved to the head, skip the invalid page
				5812	* can help us to avoid the infinity list walking.
				5813	*/
				5814	if (sp->role.invalid)
				5815	continue;
				5816
				5817	/*
				5818	* Need not flush tlb since we only zap the sp with invalid
				5819	* generation number.
				5820	*/
				5821	if (batch >= BATCH_ZAP_PAGES &&
				5822	cond_resched_lock(&kvm->mmu_lock)) {
				5823	batch = 0;
				5824	goto restart;
				5825	}
				5826
				5827	ret = kvm_mmu_prepare_zap_page(kvm, sp,
				5828	&kvm->arch.zapped_obsolete_pages);
				5829	batch += ret;
				5830
				5831	if (ret)
				5832	goto restart;
				5833	}
				5834
				5835	/*
				5836	* Should flush tlb before free page tables since lockless-walking
				5837	* may use the pages.
				5838	*/
				5839	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
				5840	}
				5841
				5842	/*
				5843	* Fast invalidate all shadow pages and use lock-break technique
				5844	* to zap obsolete pages.
				5845	*
				5846	* It's required when memslot is being deleted or VM is being
				5847	* destroyed, in these cases, we should ensure that KVM MMU does
				5848	* not use any resource of the being-deleted slot or all slots
				5849	* after calling the function.
				5850	*/
				5851	void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
				5852	{
				5853	spin_lock(&kvm->mmu_lock);
				5854	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
				5855	kvm->arch.mmu_valid_gen++;
				5856
				5857	/*
				5858	* Notify all vcpus to reload its shadow page table
				5859	* and flush TLB. Then all vcpus will switch to new
				5860	* shadow page table with the new mmu_valid_gen.
				5861	*
				5862	* Note: we should do this under the protection of
				5863	* mmu-lock, otherwise, vcpu would purge shadow page
				5864	* but miss tlb flush.
				5865	*/
				5866	kvm_reload_remote_mmus(kvm);
				5867
				5868	kvm_zap_obsolete_pages(kvm);
				5869	spin_unlock(&kvm->mmu_lock);
				5870	}
				5871
				5872	static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
				5873	{
				5874	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
				5875	}
				5876
				5877	void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
				5878	{
				5879	gen &= MMIO_GEN_MASK;
				5880
				5881	/*
				5882	* Shift to eliminate the "update in-progress" flag, which isn't
				5883	* included in the spte's generation number.
				5884	*/
				5885	gen >>= 1;
				5886
				5887	/*
				5888	* Generation numbers are incremented in multiples of the number of
				5889	* address spaces in order to provide unique generations across all
				5890	* address spaces. Strip what is effectively the address space
				5891	* modifier prior to checking for a wrap of the MMIO generation so
				5892	* that a wrap in any address space is detected.
				5893	*/
				5894	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
				5895
				5896	/*
				5897	* The very rare case: if the MMIO generation number has wrapped,
				5898	* zap all shadow pages.
				5899	*/
				5900	if (unlikely(gen == 0)) {
				5901	kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
				5902	kvm_mmu_invalidate_zap_all_pages(kvm);
				5903	}
				5904	}
				5905
				5906	static unsigned long
				5907	mmu_shrink_scan(struct shrinker shrink, struct shrink_control sc)
				5908	{
				5909	struct kvm *kvm;
				5910	int nr_to_scan = sc->nr_to_scan;
				5911	unsigned long freed = 0;
				5912
				5913	mutex_lock(&kvm_lock);
				5914
				5915	list_for_each_entry(kvm, &vm_list, vm_list) {
				5916	int idx;
				5917	LIST_HEAD(invalid_list);
				5918
				5919	/*
				5920	* Never scan more than sc->nr_to_scan VM instances.
				5921	* Will not hit this condition practically since we do not try
				5922	* to shrink more than one VM and it is very unlikely to see
				5923	* !n_used_mmu_pages so many times.
				5924	*/
				5925	if (!nr_to_scan--)
				5926	break;
				5927	/*
				5928	* n_used_mmu_pages is accessed without holding kvm->mmu_lock
				5929	* here. We may skip a VM instance errorneosly, but we do not
				5930	* want to shrink a VM that only started to populate its MMU
				5931	* anyway.
				5932	*/
				5933	if (!kvm->arch.n_used_mmu_pages &&
				5934	!kvm_has_zapped_obsolete_pages(kvm))
				5935	continue;
				5936
				5937	idx = srcu_read_lock(&kvm->srcu);
				5938	spin_lock(&kvm->mmu_lock);
				5939
				5940	if (kvm_has_zapped_obsolete_pages(kvm)) {
				5941	kvm_mmu_commit_zap_page(kvm,
				5942	&kvm->arch.zapped_obsolete_pages);
				5943	goto unlock;
				5944	}
				5945
				5946	if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				5947	freed++;
				5948	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				5949
				5950	unlock:
				5951	spin_unlock(&kvm->mmu_lock);
				5952	srcu_read_unlock(&kvm->srcu, idx);
				5953
				5954	/*
				5955	* unfair on small ones
				5956	* per-vm shrinkers cry out
				5957	* sadness comes quickly
				5958	*/
				5959	list_move_tail(&kvm->vm_list, &vm_list);
				5960	break;
				5961	}
				5962
				5963	mutex_unlock(&kvm_lock);
				5964	return freed;
				5965	}
				5966
				5967	static unsigned long
				5968	mmu_shrink_count(struct shrinker shrink, struct shrink_control sc)
				5969	{
				5970	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
				5971	}
				5972
				5973	static struct shrinker mmu_shrinker = {
				5974	.count_objects = mmu_shrink_count,
				5975	.scan_objects = mmu_shrink_scan,
				5976	.seeks = DEFAULT_SEEKS * 10,
				5977	};
				5978
				5979	static void mmu_destroy_caches(void)
				5980	{
				5981	kmem_cache_destroy(pte_list_desc_cache);
				5982	kmem_cache_destroy(mmu_page_header_cache);
				5983	}
				5984
				5985	static bool get_nx_auto_mode(void)
				5986	{
				5987	/* Return true when CPU has the bug, and mitigations are ON */
				5988	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
				5989	}
				5990
				5991	static void __set_nx_huge_pages(bool val)
				5992	{
				5993	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
				5994	}
				5995
				5996	static int set_nx_huge_pages(const char val, const struct kernel_param kp)
				5997	{
				5998	bool old_val = nx_huge_pages;
				5999	bool new_val;
				6000
				6001	/* In "auto" mode deploy workaround only if CPU has the bug. */
				6002	if (sysfs_streq(val, "off"))
				6003	new_val = 0;
				6004	else if (sysfs_streq(val, "force"))
				6005	new_val = 1;
				6006	else if (sysfs_streq(val, "auto"))
				6007	new_val = get_nx_auto_mode();
				6008	else if (strtobool(val, &new_val) < 0)
				6009	return -EINVAL;
				6010
				6011	__set_nx_huge_pages(new_val);
				6012
				6013	if (new_val != old_val) {
				6014	struct kvm *kvm;
				6015	int idx;
				6016
				6017	mutex_lock(&kvm_lock);
				6018
				6019	list_for_each_entry(kvm, &vm_list, vm_list) {
				6020	idx = srcu_read_lock(&kvm->srcu);
				6021	kvm_mmu_invalidate_zap_all_pages(kvm);
				6022	srcu_read_unlock(&kvm->srcu, idx);
				6023
				6024	wake_up_process(kvm->arch.nx_lpage_recovery_thread);
				6025	}
				6026	mutex_unlock(&kvm_lock);
				6027	}
				6028
				6029	return 0;
				6030	}
				6031
				6032	int kvm_mmu_module_init(void)
				6033	{
				6034	int ret = -ENOMEM;
				6035
				6036	if (nx_huge_pages == -1)
				6037	__set_nx_huge_pages(get_nx_auto_mode());
				6038
				6039	kvm_mmu_reset_all_pte_masks();
				6040
				6041	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
				6042	sizeof(struct pte_list_desc),
				6043	0, SLAB_ACCOUNT, NULL);
				6044	if (!pte_list_desc_cache)
				6045	goto out;
				6046
				6047	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
				6048	sizeof(struct kvm_mmu_page),
				6049	0, SLAB_ACCOUNT, NULL);
				6050	if (!mmu_page_header_cache)
				6051	goto out;
				6052
				6053	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
				6054	goto out;
				6055
				6056	ret = register_shrinker(&mmu_shrinker);
				6057	if (ret)
				6058	goto out;
				6059
				6060	return 0;
				6061
				6062	out:
				6063	mmu_destroy_caches();
				6064	return ret;
				6065	}
				6066
				6067	/*
				6068	* Caculate mmu pages needed for kvm.
				6069	*/
				6070	unsigned long kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
				6071	{
				6072	unsigned long nr_mmu_pages;
				6073	unsigned long nr_pages = 0;
				6074	struct kvm_memslots *slots;
				6075	struct kvm_memory_slot *memslot;
				6076	int i;
				6077
				6078	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				6079	slots = __kvm_memslots(kvm, i);
				6080
				6081	kvm_for_each_memslot(memslot, slots)
				6082	nr_pages += memslot->npages;
				6083	}
				6084
				6085	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
				6086	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
				6087
				6088	return nr_mmu_pages;
				6089	}
				6090
				/*
				 * Per-vcpu MMU teardown: unload the MMU, then free the vcpu's MMU pages
				 * and its MMU memory caches, in that order.
				 */
				void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
				{
					kvm_mmu_unload(vcpu);

					free_mmu_pages(vcpu);
					mmu_free_memory_caches(vcpu);
				}
				6097
				6098	void kvm_mmu_module_exit(void)
				6099	{
				6100	mmu_destroy_caches();
				6101	percpu_counter_destroy(&kvm_total_used_mmu_pages);
				6102	unregister_shrinker(&mmu_shrinker);
				6103	mmu_audit_disable();
				6104	}
				6105
				6106	static int set_nx_huge_pages_recovery_ratio(const char val, const struct kernel_param kp)
				6107	{
				6108	unsigned int old_val;
				6109	int err;
				6110
				6111	old_val = nx_huge_pages_recovery_ratio;
				6112	err = param_set_uint(val, kp);
				6113	if (err)
				6114	return err;
				6115
				6116	if (READ_ONCE(nx_huge_pages) &&
				6117	!old_val && nx_huge_pages_recovery_ratio) {
				6118	struct kvm *kvm;
				6119
				6120	mutex_lock(&kvm_lock);
				6121
				6122	list_for_each_entry(kvm, &vm_list, vm_list)
				6123	wake_up_process(kvm->arch.nx_lpage_recovery_thread);
				6124
				6125	mutex_unlock(&kvm_lock);
				6126	}
				6127
				6128	return err;
				6129	}
				6130
				6131	static void kvm_recover_nx_lpages(struct kvm *kvm)
				6132	{
				6133	int rcu_idx;
				6134	struct kvm_mmu_page *sp;
				6135	unsigned int ratio;
				6136	LIST_HEAD(invalid_list);
				6137	ulong to_zap;
				6138
				6139	rcu_idx = srcu_read_lock(&kvm->srcu);
				6140	spin_lock(&kvm->mmu_lock);
				6141
				6142	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
				6143	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
				6144	while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
				6145	/*
				6146	* We use a separate list instead of just using active_mmu_pages
				6147	* because the number of lpage_disallowed pages is expected to
				6148	* be relatively small compared to the total.
				6149	*/
				6150	sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
				6151	struct kvm_mmu_page,
				6152	lpage_disallowed_link);
				6153	WARN_ON_ONCE(!sp->lpage_disallowed);
				6154	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
				6155	WARN_ON_ONCE(sp->lpage_disallowed);
				6156
				6157	if (!--to_zap \|\| need_resched() \|\| spin_needbreak(&kvm->mmu_lock)) {
				6158	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				6159	if (to_zap)
				6160	cond_resched_lock(&kvm->mmu_lock);
				6161	}
				6162	}
				6163
				6164	spin_unlock(&kvm->mmu_lock);
				6165	srcu_read_unlock(&kvm->srcu, rcu_idx);
				6166	}
				6167
				6168	static long get_nx_lpage_recovery_timeout(u64 start_time)
				6169	{
				6170	return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
				6171	? start_time + 60 * HZ - get_jiffies_64()
				6172	: MAX_SCHEDULE_TIMEOUT;
				6173	}
				6174
				6175	static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
				6176	{
				6177	u64 start_time;
				6178	long remaining_time;
				6179
				6180	while (true) {
				6181	start_time = get_jiffies_64();
				6182	remaining_time = get_nx_lpage_recovery_timeout(start_time);
				6183
				6184	set_current_state(TASK_INTERRUPTIBLE);
				6185	while (!kthread_should_stop() && remaining_time > 0) {
				6186	schedule_timeout(remaining_time);
				6187	remaining_time = get_nx_lpage_recovery_timeout(start_time);
				6188	set_current_state(TASK_INTERRUPTIBLE);
				6189	}
				6190
				6191	set_current_state(TASK_RUNNING);
				6192
				6193	if (kthread_should_stop())
				6194	return 0;
				6195
				6196	kvm_recover_nx_lpages(kvm);
				6197	}
				6198	}
				6199
				6200	int kvm_mmu_post_init_vm(struct kvm *kvm)
				6201	{
				6202	int err;
				6203
				6204	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
				6205	"kvm-nx-lpage-recovery",
				6206	&kvm->arch.nx_lpage_recovery_thread);
				6207	if (!err)
				6208	kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
				6209
				6210	return err;
				6211	}
				6212
				6213	void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
				6214	{
				6215	if (kvm->arch.nx_lpage_recovery_thread)
				6216	kthread_stop(kvm->arch.nx_lpage_recovery_thread);
				6217	}