| xj | b04a402 | 2021-11-25 15:01:52 +0800 | [diff] [blame] | 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 
|  | 2 | #ifndef __KVM_X86_MMU_H | 
|  | 3 | #define __KVM_X86_MMU_H | 
|  | 4 |  | 
|  | 5 | #include <linux/kvm_host.h> | 
|  | 6 | #include "kvm_cache_regs.h" | 
|  | 7 |  | 
/*
 * Index width per paging level for the PT64 and PT32 table formats, and the
 * number of entries in one table page that follows from each width.
 */
#define PT64_PT_BITS		9
#define PT32_PT_BITS		10

#define PT64_ENT_PER_PAGE	(1 << PT64_PT_BITS)
#define PT32_ENT_PER_PAGE	(1 << PT32_PT_BITS)
|  | 12 |  | 
/* Bit positions of the page-table entry flags that are also used as shifts. */
#define PT_WRITABLE_SHIFT	1
#define PT_USER_SHIFT		2
#define PT_ACCESSED_SHIFT	5
#define PT_DIRTY_SHIFT		6
#define PT_PAGE_SIZE_SHIFT	7
#define PT64_NX_SHIFT		63

/* Single-bit masks for the page-table entry flags; all are 64 bits wide. */
#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK		(1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK		(1ULL << 3)
#define PT_PCD_MASK		(1ULL << 4)
#define PT_ACCESSED_MASK	(1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK		(1ULL << PT_DIRTY_SHIFT)
#define PT_PAGE_SIZE_MASK	(1ULL << PT_PAGE_SIZE_SHIFT)
/* Same bit position as PT_PAGE_SIZE_MASK. */
#define PT_PAT_MASK		(1ULL << 7)
#define PT_GLOBAL_MASK		(1ULL << 8)
#define PT64_NX_MASK		(1ULL << PT64_NX_SHIFT)
|  | 31 |  | 
/* PAT bit position in a page-table entry and in a directory entry. */
#define PT_PAT_SHIFT		7
#define PT_DIR_PAT_SHIFT	12
#define PT_DIR_PAT_MASK		(1ULL << PT_DIR_PAT_SHIFT)

/*
 * PSE36 field of a PT32 directory entry: PT32_DIR_PSE36_SIZE bits wide,
 * starting at bit PT32_DIR_PSE36_SHIFT.
 */
#define PT32_DIR_PSE36_SIZE	4
#define PT32_DIR_PSE36_SHIFT	13
#define PT32_DIR_PSE36_MASK						\
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
|  | 40 |  | 
/* Root level of the paging hierarchy for each supported table format. */
#define PT64_ROOT_5LEVEL	5
#define PT64_ROOT_4LEVEL	4
#define PT32E_ROOT_LEVEL	3
#define PT32_ROOT_LEVEL		2

/* Named levels within a hierarchy; level 1 is the page table itself. */
#define PT_PDPE_LEVEL		3
#define PT_DIRECTORY_LEVEL	2
#define PT_PAGE_TABLE_LEVEL	1

/* Highest level a mapping can use, derived from KVM_NR_PAGE_SIZES. */
#define PT_MAX_HUGEPAGE_LEVEL	(PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
|  | 50 |  | 
/*
 * Build a mask with bits s..e set (both ends inclusive, bit 0 is the least
 * significant bit).
 *
 * Returns 0 when e < s, i.e. for an empty range.  Callers are expected to
 * pass 0 <= s and e <= 63; values outside that range are not handled here.
 *
 * The run of ones is computed as (2ULL << (e - s)) - 1 rather than
 * (1ULL << (e - s + 1)) - 1.  For the full-width range (s = 0, e = 63) the
 * latter shifts a 64-bit value by 64, which is undefined behaviour in C;
 * 2ULL << 63 is well defined, wraps to 0, and gives all ones after the
 * subtraction.
 */
static inline u64 rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;

	return ((2ULL << (e - s)) - 1) << s;
}
|  | 58 |  | 
|  | 59 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); | 
|  | 60 |  | 
|  | 61 | void | 
|  | 62 | reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 
|  | 63 |  | 
|  | 64 | void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); | 
|  | 65 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); | 
|  | 66 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, | 
|  | 67 | bool accessed_dirty, gpa_t new_eptp); | 
|  | 68 | bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); | 
|  | 69 | int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, | 
|  | 70 | u64 fault_address, char *insn, int insn_len); | 
|  | 71 |  | 
|  | 72 | static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) | 
|  | 73 | { | 
|  | 74 | if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) | 
|  | 75 | return kvm->arch.n_max_mmu_pages - | 
|  | 76 | kvm->arch.n_used_mmu_pages; | 
|  | 77 |  | 
|  | 78 | return 0; | 
|  | 79 | } | 
|  | 80 |  | 
|  | 81 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | 
|  | 82 | { | 
|  | 83 | if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) | 
|  | 84 | return 0; | 
|  | 85 |  | 
|  | 86 | return kvm_mmu_load(vcpu); | 
|  | 87 | } | 
|  | 88 |  | 
|  | 89 | static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) | 
|  | 90 | { | 
|  | 91 | BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); | 
|  | 92 |  | 
|  | 93 | return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) | 
|  | 94 | ? cr3 & X86_CR3_PCID_MASK | 
|  | 95 | : 0; | 
|  | 96 | } | 
|  | 97 |  | 
|  | 98 | static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) | 
|  | 99 | { | 
|  | 100 | return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); | 
|  | 101 | } | 
|  | 102 |  | 
|  | 103 | static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) | 
|  | 104 | { | 
|  | 105 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 
|  | 106 | vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | | 
|  | 107 | kvm_get_active_pcid(vcpu)); | 
|  | 108 | } | 
|  | 109 |  | 
/*
 * Currently, we have two sorts of write-protection: a) the first one
 * write-protects guest pages to sync guest modifications, b) the other one is
 * used to sync the dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
 * between these two sorts are:
 * 1) the first case clears the SPTE_MMU_WRITEABLE bit.
 * 2) the first case requires flushing the tlb immediately, to avoid
 *    corrupting the shadow page table between all vcpus, so it should be done
 *    under the protection of mmu-lock. The other case does not need to flush
 *    the tlb until returning the dirty bitmap to userspace, since it only
 *    write-protects the pages logged in the bitmap; that means a page in the
 *    dirty bitmap is not missed, so it can flush the tlb outside of mmu-lock.
 *
 * So, there is a problem: the first case can meet a corrupted tlb caused by
 * the other case, which write-protects pages but does not flush the tlb
 * immediately. To make the first case aware of this problem, we let it flush
 * the tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit is
 * set; this works since the other case never touches the SPTE_MMU_WRITEABLE
 * bit.
 *
 * Anyway, whenever a spte is updated (only permission and status bits are
 * changed) we need to check whether a spte with SPTE_MMU_WRITEABLE becomes
 * readonly; if that happens, we need to flush the tlb. Fortunately,
 * mmu_spte_update() already handles this.
 *
 * The rules for using SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
 * - if we want to see whether the spte has a writable tlb entry, or whether
 *   it can be made writable on the mmu mapping, check SPTE_MMU_WRITEABLE;
 *   this is the common case. Otherwise,
 * - if we fix a page fault on the spte or do write-protection for dirty
 *   logging, check PT_WRITABLE_MASK.
 *
 * TODO: introduce APIs to split these two cases.
 */
|  | 143 | static inline int is_writable_pte(unsigned long pte) | 
|  | 144 | { | 
|  | 145 | return pte & PT_WRITABLE_MASK; | 
|  | 146 | } | 
|  | 147 |  | 
|  | 148 | static inline bool is_write_protection(struct kvm_vcpu *vcpu) | 
|  | 149 | { | 
|  | 150 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | 
|  | 151 | } | 
|  | 152 |  | 
|  | 153 | /* | 
|  | 154 | * Check if a given access (described through the I/D, W/R and U/S bits of a | 
|  | 155 | * page fault error code pfec) causes a permission fault with the given PTE | 
|  | 156 | * access rights (in ACC_* format). | 
|  | 157 | * | 
|  | 158 | * Return zero if the access does not fault; return the page fault error code | 
|  | 159 | * if the access faults. | 
|  | 160 | */ | 
|  | 161 | static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 
|  | 162 | unsigned pte_access, unsigned pte_pkey, | 
|  | 163 | unsigned pfec) | 
|  | 164 | { | 
|  | 165 | int cpl = kvm_x86_ops->get_cpl(vcpu); | 
|  | 166 | unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); | 
|  | 167 |  | 
|  | 168 | /* | 
|  | 169 | * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. | 
|  | 170 | * | 
|  | 171 | * If CPL = 3, SMAP applies to all supervisor-mode data accesses | 
|  | 172 | * (these are implicit supervisor accesses) regardless of the value | 
|  | 173 | * of EFLAGS.AC. | 
|  | 174 | * | 
|  | 175 | * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving | 
|  | 176 | * the result in X86_EFLAGS_AC. We then insert it in place of | 
|  | 177 | * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, | 
|  | 178 | * but it will be one in index if SMAP checks are being overridden. | 
|  | 179 | * It is important to keep this branchless. | 
|  | 180 | */ | 
|  | 181 | unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); | 
|  | 182 | int index = (pfec >> 1) + | 
|  | 183 | (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); | 
|  | 184 | bool fault = (mmu->permissions[index] >> pte_access) & 1; | 
|  | 185 | u32 errcode = PFERR_PRESENT_MASK; | 
|  | 186 |  | 
|  | 187 | WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); | 
|  | 188 | if (unlikely(mmu->pkru_mask)) { | 
|  | 189 | u32 pkru_bits, offset; | 
|  | 190 |  | 
|  | 191 | /* | 
|  | 192 | * PKRU defines 32 bits, there are 16 domains and 2 | 
|  | 193 | * attribute bits per domain in pkru.  pte_pkey is the | 
|  | 194 | * index of the protection domain, so pte_pkey * 2 is | 
|  | 195 | * is the index of the first bit for the domain. | 
|  | 196 | */ | 
|  | 197 | pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; | 
|  | 198 |  | 
|  | 199 | /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ | 
|  | 200 | offset = (pfec & ~1) + | 
|  | 201 | ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); | 
|  | 202 |  | 
|  | 203 | pkru_bits &= mmu->pkru_mask >> offset; | 
|  | 204 | errcode |= -pkru_bits & PFERR_PK_MASK; | 
|  | 205 | fault |= (pkru_bits != 0); | 
|  | 206 | } | 
|  | 207 |  | 
|  | 208 | return -(u32)fault & errcode; | 
|  | 209 | } | 
|  | 210 |  | 
|  | 211 | void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); | 
|  | 212 | void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); | 
|  | 213 |  | 
|  | 214 | void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); | 
|  | 215 | void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); | 
|  | 216 | bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, | 
|  | 217 | struct kvm_memory_slot *slot, u64 gfn); | 
|  | 218 | int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); | 
|  | 219 |  | 
|  | 220 | int kvm_mmu_post_init_vm(struct kvm *kvm); | 
|  | 221 | void kvm_mmu_pre_destroy_vm(struct kvm *kvm); | 
|  | 222 |  | 
|  | 223 | #endif |