b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame^] | 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Copyright (C) 2012,2013 - ARM Ltd |
| 4 | * Author: Marc Zyngier <marc.zyngier@arm.com> |
| 5 | * |
| 6 | * Derived from arch/arm/kvm/reset.c |
| 7 | * Copyright (C) 2012 - Virtual Open Systems and Columbia University |
| 8 | * Author: Christoffer Dall <c.dall@virtualopensystems.com> |
| 9 | */ |
| 10 | |
| 11 | #include <linux/errno.h> |
| 12 | #include <linux/kernel.h> |
| 13 | #include <linux/kvm_host.h> |
| 14 | #include <linux/kvm.h> |
| 15 | #include <linux/hw_breakpoint.h> |
| 16 | #include <linux/slab.h> |
| 17 | #include <linux/string.h> |
| 18 | #include <linux/types.h> |
| 19 | |
| 20 | #include <kvm/arm_arch_timer.h> |
| 21 | |
| 22 | #include <asm/cpufeature.h> |
| 23 | #include <asm/cputype.h> |
| 24 | #include <asm/fpsimd.h> |
| 25 | #include <asm/ptrace.h> |
| 26 | #include <asm/kvm_arm.h> |
| 27 | #include <asm/kvm_asm.h> |
| 28 | #include <asm/kvm_coproc.h> |
| 29 | #include <asm/kvm_emulate.h> |
| 30 | #include <asm/kvm_mmu.h> |
| 31 | #include <asm/virt.h> |
| 32 | |
| 33 | /* Maximum phys_shift supported for any VM on this host */ |
| 34 | static u32 kvm_ipa_limit; |
| 35 | |
| 36 | /* |
| 37 | * ARMv8 Reset Values |
| 38 | */ |
| 39 | static const struct kvm_regs default_regs_reset = { |
| 40 | .regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | |
| 41 | PSR_F_BIT | PSR_D_BIT), |
| 42 | }; |
| 43 | |
| 44 | static const struct kvm_regs default_regs_reset32 = { |
| 45 | .regs.pstate = (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | |
| 46 | PSR_AA32_I_BIT | PSR_AA32_F_BIT), |
| 47 | }; |
| 48 | |
| 49 | static bool cpu_has_32bit_el1(void) |
| 50 | { |
| 51 | u64 pfr0; |
| 52 | |
| 53 | pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); |
| 54 | return !!(pfr0 & 0x20); |
| 55 | } |
| 56 | |
| 57 | /** |
| 58 | * kvm_arch_vm_ioctl_check_extension |
| 59 | * |
| 60 | * We currently assume that the number of HW registers is uniform |
| 61 | * across all CPUs (see cpuinfo_sanity_check). |
| 62 | */ |
| 63 | int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) |
| 64 | { |
| 65 | int r; |
| 66 | |
| 67 | switch (ext) { |
| 68 | case KVM_CAP_ARM_EL1_32BIT: |
| 69 | r = cpu_has_32bit_el1(); |
| 70 | break; |
| 71 | case KVM_CAP_GUEST_DEBUG_HW_BPS: |
| 72 | r = get_num_brps(); |
| 73 | break; |
| 74 | case KVM_CAP_GUEST_DEBUG_HW_WPS: |
| 75 | r = get_num_wrps(); |
| 76 | break; |
| 77 | case KVM_CAP_ARM_PMU_V3: |
| 78 | r = kvm_arm_support_pmu_v3(); |
| 79 | break; |
| 80 | case KVM_CAP_ARM_INJECT_SERROR_ESR: |
| 81 | r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); |
| 82 | break; |
| 83 | case KVM_CAP_SET_GUEST_DEBUG: |
| 84 | case KVM_CAP_VCPU_ATTRIBUTES: |
| 85 | r = 1; |
| 86 | break; |
| 87 | case KVM_CAP_ARM_VM_IPA_SIZE: |
| 88 | r = kvm_ipa_limit; |
| 89 | break; |
| 90 | case KVM_CAP_ARM_SVE: |
| 91 | r = system_supports_sve(); |
| 92 | break; |
| 93 | case KVM_CAP_ARM_PTRAUTH_ADDRESS: |
| 94 | case KVM_CAP_ARM_PTRAUTH_GENERIC: |
| 95 | r = has_vhe() && system_supports_address_auth() && |
| 96 | system_supports_generic_auth(); |
| 97 | break; |
| 98 | default: |
| 99 | r = 0; |
| 100 | } |
| 101 | |
| 102 | return r; |
| 103 | } |
| 104 | |
| 105 | unsigned int kvm_sve_max_vl; |
| 106 | |
| 107 | int kvm_arm_init_sve(void) |
| 108 | { |
| 109 | if (system_supports_sve()) { |
| 110 | kvm_sve_max_vl = sve_max_virtualisable_vl; |
| 111 | |
| 112 | /* |
| 113 | * The get_sve_reg()/set_sve_reg() ioctl interface will need |
| 114 | * to be extended with multiple register slice support in |
| 115 | * order to support vector lengths greater than |
| 116 | * SVE_VL_ARCH_MAX: |
| 117 | */ |
| 118 | if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX)) |
| 119 | kvm_sve_max_vl = SVE_VL_ARCH_MAX; |
| 120 | |
| 121 | /* |
| 122 | * Don't even try to make use of vector lengths that |
| 123 | * aren't available on all CPUs, for now: |
| 124 | */ |
| 125 | if (kvm_sve_max_vl < sve_max_vl) |
| 126 | pr_warn("KVM: SVE vector length for guests limited to %u bytes\n", |
| 127 | kvm_sve_max_vl); |
| 128 | } |
| 129 | |
| 130 | return 0; |
| 131 | } |
| 132 | |
| 133 | static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) |
| 134 | { |
| 135 | if (!system_supports_sve()) |
| 136 | return -EINVAL; |
| 137 | |
| 138 | /* Verify that KVM startup enforced this when SVE was detected: */ |
| 139 | if (WARN_ON(!has_vhe())) |
| 140 | return -EINVAL; |
| 141 | |
| 142 | vcpu->arch.sve_max_vl = kvm_sve_max_vl; |
| 143 | |
| 144 | /* |
| 145 | * Userspace can still customize the vector lengths by writing |
| 146 | * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until |
| 147 | * kvm_arm_vcpu_finalize(), which freezes the configuration. |
| 148 | */ |
| 149 | vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; |
| 150 | |
| 151 | return 0; |
| 152 | } |
| 153 | |
| 154 | /* |
| 155 | * Finalize vcpu's maximum SVE vector length, allocating |
| 156 | * vcpu->arch.sve_state as necessary. |
| 157 | */ |
| 158 | static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) |
| 159 | { |
| 160 | void *buf; |
| 161 | unsigned int vl; |
| 162 | |
| 163 | vl = vcpu->arch.sve_max_vl; |
| 164 | |
| 165 | /* |
| 166 | * Resposibility for these properties is shared between |
| 167 | * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and |
| 168 | * set_sve_vls(). Double-check here just to be sure: |
| 169 | */ |
| 170 | if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl || |
| 171 | vl > SVE_VL_ARCH_MAX)) |
| 172 | return -EIO; |
| 173 | |
| 174 | buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); |
| 175 | if (!buf) |
| 176 | return -ENOMEM; |
| 177 | |
| 178 | vcpu->arch.sve_state = buf; |
| 179 | vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED; |
| 180 | return 0; |
| 181 | } |
| 182 | |
| 183 | int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) |
| 184 | { |
| 185 | switch (feature) { |
| 186 | case KVM_ARM_VCPU_SVE: |
| 187 | if (!vcpu_has_sve(vcpu)) |
| 188 | return -EINVAL; |
| 189 | |
| 190 | if (kvm_arm_vcpu_sve_finalized(vcpu)) |
| 191 | return -EPERM; |
| 192 | |
| 193 | return kvm_vcpu_finalize_sve(vcpu); |
| 194 | } |
| 195 | |
| 196 | return -EINVAL; |
| 197 | } |
| 198 | |
| 199 | bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) |
| 200 | { |
| 201 | if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) |
| 202 | return false; |
| 203 | |
| 204 | return true; |
| 205 | } |
| 206 | |
| 207 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) |
| 208 | { |
| 209 | kfree(vcpu->arch.sve_state); |
| 210 | } |
| 211 | |
| 212 | static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) |
| 213 | { |
| 214 | if (vcpu_has_sve(vcpu)) |
| 215 | memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); |
| 216 | } |
| 217 | |
| 218 | static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) |
| 219 | { |
| 220 | /* Support ptrauth only if the system supports these capabilities. */ |
| 221 | if (!has_vhe()) |
| 222 | return -EINVAL; |
| 223 | |
| 224 | if (!system_supports_address_auth() || |
| 225 | !system_supports_generic_auth()) |
| 226 | return -EINVAL; |
| 227 | /* |
| 228 | * For now make sure that both address/generic pointer authentication |
| 229 | * features are requested by the userspace together. |
| 230 | */ |
| 231 | if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || |
| 232 | !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) |
| 233 | return -EINVAL; |
| 234 | |
| 235 | vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH; |
| 236 | return 0; |
| 237 | } |
| 238 | |
| 239 | /** |
| 240 | * kvm_reset_vcpu - sets core registers and sys_regs to reset value |
| 241 | * @vcpu: The VCPU pointer |
| 242 | * |
| 243 | * This function finds the right table above and sets the registers on |
| 244 | * the virtual CPU struct to their architecturally defined reset |
| 245 | * values, except for registers whose reset is deferred until |
| 246 | * kvm_arm_vcpu_finalize(). |
| 247 | * |
| 248 | * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT |
| 249 | * ioctl or as part of handling a request issued by another VCPU in the PSCI |
| 250 | * handling code. In the first case, the VCPU will not be loaded, and in the |
| 251 | * second case the VCPU will be loaded. Because this function operates purely |
| 252 | * on the memory-backed valus of system registers, we want to do a full put if |
| 253 | * we were loaded (handling a request) and load the values back at the end of |
| 254 | * the function. Otherwise we leave the state alone. In both cases, we |
| 255 | * disable preemption around the vcpu reset as we would otherwise race with |
| 256 | * preempt notifiers which also call put/load. |
| 257 | */ |
| 258 | int kvm_reset_vcpu(struct kvm_vcpu *vcpu) |
| 259 | { |
| 260 | const struct kvm_regs *cpu_reset; |
| 261 | int ret; |
| 262 | bool loaded; |
| 263 | |
| 264 | /* Reset PMU outside of the non-preemptible section */ |
| 265 | kvm_pmu_vcpu_reset(vcpu); |
| 266 | |
| 267 | preempt_disable(); |
| 268 | loaded = (vcpu->cpu != -1); |
| 269 | if (loaded) |
| 270 | kvm_arch_vcpu_put(vcpu); |
| 271 | |
| 272 | if (!kvm_arm_vcpu_sve_finalized(vcpu)) { |
| 273 | if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) { |
| 274 | ret = kvm_vcpu_enable_sve(vcpu); |
| 275 | if (ret) |
| 276 | goto out; |
| 277 | } |
| 278 | } else { |
| 279 | kvm_vcpu_reset_sve(vcpu); |
| 280 | } |
| 281 | |
| 282 | if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || |
| 283 | test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) { |
| 284 | if (kvm_vcpu_enable_ptrauth(vcpu)) { |
| 285 | ret = -EINVAL; |
| 286 | goto out; |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | switch (vcpu->arch.target) { |
| 291 | default: |
| 292 | if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { |
| 293 | if (!cpu_has_32bit_el1()) { |
| 294 | ret = -EINVAL; |
| 295 | goto out; |
| 296 | } |
| 297 | cpu_reset = &default_regs_reset32; |
| 298 | } else { |
| 299 | cpu_reset = &default_regs_reset; |
| 300 | } |
| 301 | |
| 302 | break; |
| 303 | } |
| 304 | |
| 305 | /* Reset core registers */ |
| 306 | memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset)); |
| 307 | |
| 308 | /* Reset system registers */ |
| 309 | kvm_reset_sys_regs(vcpu); |
| 310 | |
| 311 | /* |
| 312 | * Additional reset state handling that PSCI may have imposed on us. |
| 313 | * Must be done after all the sys_reg reset. |
| 314 | */ |
| 315 | if (vcpu->arch.reset_state.reset) { |
| 316 | unsigned long target_pc = vcpu->arch.reset_state.pc; |
| 317 | |
| 318 | /* Gracefully handle Thumb2 entry point */ |
| 319 | if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { |
| 320 | target_pc &= ~1UL; |
| 321 | vcpu_set_thumb(vcpu); |
| 322 | } |
| 323 | |
| 324 | /* Propagate caller endianness */ |
| 325 | if (vcpu->arch.reset_state.be) |
| 326 | kvm_vcpu_set_be(vcpu); |
| 327 | |
| 328 | *vcpu_pc(vcpu) = target_pc; |
| 329 | vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); |
| 330 | |
| 331 | vcpu->arch.reset_state.reset = false; |
| 332 | } |
| 333 | |
| 334 | /* Default workaround setup is enabled (if supported) */ |
| 335 | if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL) |
| 336 | vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG; |
| 337 | |
| 338 | /* Reset timer */ |
| 339 | ret = kvm_timer_vcpu_reset(vcpu); |
| 340 | out: |
| 341 | if (loaded) |
| 342 | kvm_arch_vcpu_load(vcpu, smp_processor_id()); |
| 343 | preempt_enable(); |
| 344 | return ret; |
| 345 | } |
| 346 | |
| 347 | void kvm_set_ipa_limit(void) |
| 348 | { |
| 349 | unsigned int ipa_max, pa_max, va_max, parange; |
| 350 | |
| 351 | parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; |
| 352 | pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); |
| 353 | |
| 354 | /* Clamp the IPA limit to the PA size supported by the kernel */ |
| 355 | ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; |
| 356 | /* |
| 357 | * Since our stage2 table is dependent on the stage1 page table code, |
| 358 | * we must always honor the following condition: |
| 359 | * |
| 360 | * Number of levels in Stage1 >= Number of levels in Stage2. |
| 361 | * |
| 362 | * So clamp the ipa limit further down to limit the number of levels. |
| 363 | * Since we can concatenate upto 16 tables at entry level, we could |
| 364 | * go upto 4bits above the maximum VA addressible with the current |
| 365 | * number of levels. |
| 366 | */ |
| 367 | va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; |
| 368 | va_max += 4; |
| 369 | |
| 370 | if (va_max < ipa_max) |
| 371 | ipa_max = va_max; |
| 372 | |
| 373 | /* |
| 374 | * If the final limit is lower than the real physical address |
| 375 | * limit of the CPUs, report the reason. |
| 376 | */ |
| 377 | if (ipa_max < pa_max) |
| 378 | pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", |
| 379 | (va_max < pa_max) ? "Virtual" : "Physical"); |
| 380 | |
| 381 | kvm_ipa_limit = ipa_max; |
| 382 | kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, |
| 383 | ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? |
| 384 | " (Reduced IPA size, limited VM/VMM compatibility)" : "")); |
| 385 | } |
| 386 | |
| 387 | /* |
| 388 | * Configure the VTCR_EL2 for this VM. The VTCR value is common |
| 389 | * across all the physical CPUs on the system. We use system wide |
| 390 | * sanitised values to fill in different fields, except for Hardware |
| 391 | * Management of Access Flags. HA Flag is set unconditionally on |
| 392 | * all CPUs, as it is safe to run with or without the feature and |
| 393 | * the bit is RES0 on CPUs that don't support it. |
| 394 | */ |
| 395 | int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) |
| 396 | { |
| 397 | u64 vtcr = VTCR_EL2_FLAGS; |
| 398 | u32 parange, phys_shift; |
| 399 | u8 lvls; |
| 400 | |
| 401 | if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) |
| 402 | return -EINVAL; |
| 403 | |
| 404 | phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); |
| 405 | if (phys_shift) { |
| 406 | if (phys_shift > kvm_ipa_limit || |
| 407 | phys_shift < 32) |
| 408 | return -EINVAL; |
| 409 | } else { |
| 410 | phys_shift = KVM_PHYS_SHIFT; |
| 411 | if (phys_shift > kvm_ipa_limit) { |
| 412 | pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", |
| 413 | current->comm); |
| 414 | return -EINVAL; |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; |
| 419 | if (parange > ID_AA64MMFR0_PARANGE_MAX) |
| 420 | parange = ID_AA64MMFR0_PARANGE_MAX; |
| 421 | vtcr |= parange << VTCR_EL2_PS_SHIFT; |
| 422 | |
| 423 | vtcr |= VTCR_EL2_T0SZ(phys_shift); |
| 424 | /* |
| 425 | * Use a minimum 2 level page table to prevent splitting |
| 426 | * host PMD huge pages at stage2. |
| 427 | */ |
| 428 | lvls = stage2_pgtable_levels(phys_shift); |
| 429 | if (lvls < 2) |
| 430 | lvls = 2; |
| 431 | vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); |
| 432 | |
| 433 | /* |
| 434 | * Enable the Hardware Access Flag management, unconditionally |
| 435 | * on all CPUs. The features is RES0 on CPUs without the support |
| 436 | * and must be ignored by the CPUs. |
| 437 | */ |
| 438 | vtcr |= VTCR_EL2_HA; |
| 439 | |
| 440 | /* Set the vmid bits */ |
| 441 | vtcr |= (kvm_get_vmid_bits() == 16) ? |
| 442 | VTCR_EL2_VS_16BIT : |
| 443 | VTCR_EL2_VS_8BIT; |
| 444 | kvm->arch.vtcr = vtcr; |
| 445 | return 0; |
| 446 | } |