| rjw | 1f88458 | 2022-01-06 17:20:42 +0800 | [diff] [blame] | 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 
|  | 2 | #include <linux/jump_label.h> | 
|  | 3 | #include <asm/unwind_hints.h> | 
|  | 4 | #include <asm/cpufeatures.h> | 
|  | 5 | #include <asm/page_types.h> | 
|  | 6 | #include <asm/percpu.h> | 
|  | 7 | #include <asm/asm-offsets.h> | 
|  | 8 | #include <asm/processor-flags.h> | 
|  | 9 |  | 
|  | 10 | /* | 
|  | 11 |  | 
|  | 12 | x86 function call convention, 64-bit: | 
|  | 13 | ------------------------------------- | 
|  | 14 | arguments           |  callee-saved      | extra caller-saved | return | 
|  | 15 | [callee-clobbered]   |                    | [callee-clobbered] | | 
|  | 16 | --------------------------------------------------------------------------- | 
|  | 17 | rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11             | rax, rdx [**] | 
|  | 18 |  | 
|  | 19 | ( rsp is obviously invariant across normal function calls. (gcc can 'merge' | 
|  | 20 | functions when it sees tail-call optimization possibilities) rflags is | 
|  | 21 | clobbered. Leftover arguments are passed over the stack frame.) | 
|  | 22 |  | 
|  | 23 | [*]  In the frame-pointers case rbp is fixed to the stack frame. | 
|  | 24 |  | 
|  | 25 | [**] for struct return values wider than 64 bits the return convention is a | 
|  | 26 | bit more complex: up to 128 bits width we return small structures | 
|  | 27 | straight in rax, rdx. For structures larger than that (3 words or | 
|  | 28 | larger) the caller puts a pointer to an on-stack return struct | 
|  | 29 | [allocated in the caller's stack frame] into the first argument - i.e. | 
|  | 30 | into rdi. All other arguments shift up by one in this case. | 
|  | 31 | Fortunately this case is rare in the kernel. | 
|  | 32 |  | 
|  | 33 | For 32-bit we have the following conventions - kernel is built with | 
|  | 34 | -mregparm=3 and -freg-struct-return: | 
|  | 35 |  | 
|  | 36 | x86 function calling convention, 32-bit: | 
|  | 37 | ---------------------------------------- | 
|  | 38 | arguments         | callee-saved        | extra caller-saved | return | 
|  | 39 | [callee-clobbered] |                     | [callee-clobbered] | | 
|  | 40 | ------------------------------------------------------------------------- | 
|  | 41 | eax edx ecx        | ebx edi esi ebp [*] | <none>             | eax, edx [**] | 
|  | 42 |  | 
|  | 43 | ( here too esp is obviously invariant across normal function calls. eflags | 
|  | 44 | is clobbered. Leftover arguments are passed over the stack frame. ) | 
|  | 45 |  | 
|  | 46 | [*]  In the frame-pointers case ebp is fixed to the stack frame. | 
|  | 47 |  | 
|  | 48 | [**] We build with -freg-struct-return, which on 32-bit means similar | 
|  | 49 | semantics as on 64-bit: edx can be used for a second return value | 
|  | 50 | (i.e. covering integer and structure sizes up to 64 bits) - after that | 
|  | 51 | it gets more complex and more expensive: 3-word or larger struct returns | 
|  | 52 | get done in the caller's frame and the pointer to the return struct goes | 
|  | 53 | into regparm0, i.e. eax - the other arguments shift up and the | 
|  | 54 | function's register parameters degenerate to regparm=2 in essence. | 
|  | 55 |  | 
|  | 56 | */ | 
|  | 57 |  | 
|  | 58 | #ifdef CONFIG_X86_64 | 
|  | 59 |  | 
|  | 60 | /* | 
|  | 61 | * 64-bit system call stack frame layout defines and helpers, | 
|  | 62 | * for assembly code: | 
|  | 63 | */ | 
|  | 64 |  | 
/*
 * Byte offsets of the "struct pt_regs" members as they are laid out on the
 * stack.  Each value is parenthesized so that it stays a single term when
 * it is combined with other operators in an operand expression.
 */

/*
 * The C ABI treats these registers as callee-preserved.  The historical note
 * here says they are not saved on kernel entry unless a syscall needs a
 * complete "struct pt_regs"; note that PUSH_AND_CLEAR_REGS in this file
 * pushes them unconditionally.
 */
#define R15		(0*8)
#define R14		(1*8)
#define R13		(2*8)
#define R12		(3*8)
#define RBP		(4*8)
#define RBX		(5*8)
/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11		(6*8)
#define R10		(7*8)
#define R9		(8*8)
#define R8		(9*8)
#define RAX		(10*8)
#define RCX		(11*8)
#define RDX		(12*8)
#define RSI		(13*8)
#define RDI		(14*8)
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
#define ORIG_RAX	(15*8)
/* Return frame for iretq */
#define RIP		(16*8)
#define CS		(17*8)
#define EFLAGS		(18*8)
#define RSP		(19*8)
#define SS		(20*8)

/* Total size of the frame described above: 21 slots of 8 bytes. */
#define SIZEOF_PTREGS	(21*8)
|  | 99 |  | 
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
	/*
	 * Lay down a complete "struct pt_regs" on the stack (di first, r15
	 * last) and afterwards zero the registers listed at the bottom.
	 *
	 *   rdx, rax: operands pushed into the pt_regs->dx and pt_regs->ax
	 *             slots (default: the registers themselves).
	 *   save_ret: non-zero when a return address sits on top of the stack
	 *             on entry.  It is carried in %rsi while the frame is
	 *             built and pushed again below pt_regs, so %rsi does not
	 *             hold its original value afterwards.
	 *
	 * %rax, %rsi and %rdi are not cleared by this macro.
	 */
	.ifne \save_ret
	pushq	%rsi			/* pt_regs->si */
	movq	8(%rsp), %rsi		/* park the return address in %rsi */
	movq	%rdi, 8(%rsp)		/* pt_regs->di reuses the return address slot */
	.else
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	.endif

	/* Callee-clobbered registers. */
	pushq	\rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	\rax			/* pt_regs->ax */
	pushq	%r8			/* pt_regs->r8 */
	pushq	%r9			/* pt_regs->r9 */
	pushq	%r10			/* pt_regs->r10 */
	pushq	%r11			/* pt_regs->r11 */

	/* Callee-preserved registers. */
	pushq	%rbx			/* pt_regs->bx */
	pushq	%rbp			/* pt_regs->bp */
	pushq	%r12			/* pt_regs->r12 */
	pushq	%r13			/* pt_regs->r13 */
	pushq	%r14			/* pt_regs->r14 */
	pushq	%r15			/* pt_regs->r15 */
	UNWIND_HINT_REGS

	.ifne \save_ret
	pushq	%rsi			/* put the return address back on top */
	.endif

	/*
	 * Scrub register contents that a speculation attack could otherwise
	 * try to use.  The lower registers are likely overwritten well before
	 * a speculative execution gadget could consume them, so they are left
	 * alone.  The clears are independent of each other.
	 */
	xorl	%ebx,  %ebx		/* nospec rbx */
	xorl	%ebp,  %ebp		/* nospec rbp */
	xorl	%ecx,  %ecx		/* nospec cx  */
	xorl	%edx,  %edx		/* nospec dx  */
	xorl	%r8d,  %r8d		/* nospec r8  */
	xorl	%r9d,  %r9d		/* nospec r9  */
	xorl	%r10d, %r10d		/* nospec r10 */
	xorl	%r11d, %r11d		/* nospec r11 */
	xorl	%r12d, %r12d		/* nospec r12 */
	xorl	%r13d, %r13d		/* nospec r13 */
	xorl	%r14d, %r14d		/* nospec r14 */
	xorl	%r15d, %r15d		/* nospec r15 */

.endm
|  | 148 |  | 
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
	/*
	 * Unwind the register part of "struct pt_regs", r15 first.
	 *
	 *   pop_rdi:     when zero, the pt_regs->di slot is left on the stack.
	 *   skip_r11rcx: when non-zero, the saved r11 and rcx values are popped
	 *                into %rsi and thereby discarded; %rsi receives its own
	 *                saved value further down.
	 */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	.ifeq \skip_r11rcx
	popq	%r11
	.else
	popq	%rsi			/* drop saved r11 */
	.endif

	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rax

	.ifeq \skip_r11rcx
	popq	%rcx
	.else
	popq	%rsi			/* drop saved rcx */
	.endif

	popq	%rdx
	popq	%rsi

	.ifne \pop_rdi
	popq	%rdi
	.endif
.endm
|  | 176 |  | 
/*
 * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
 * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
 * is just setting the LSB, which makes it an invalid stack address and is also
 * a signal to the unwinder that it's a pt_regs pointer in disguise.
 *
 * ptregs_offset is the distance from %rsp to pt_regs (default 0).
 *
 * The encoded value is produced with a single LEA that adds 1 to the pt_regs
 * address.  This relies on the pt_regs address having its LSB clear, as any
 * valid stack address does per the paragraph above; adding 1 is then the same
 * as setting the LSB.  Unlike an OR, LEA leaves RFLAGS untouched.
 *
 * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
 * the original rbp.
 */
.macro ENCODE_FRAME_POINTER ptregs_offset=0
#ifdef CONFIG_FRAME_POINTER
	leaq	1+\ptregs_offset(%rsp), %rbp
#endif
.endm
|  | 196 |  | 
|  | 197 | #ifdef CONFIG_PAGE_TABLE_ISOLATION | 
|  | 198 |  | 
/*
 * With PAGE_TABLE_ISOLATION the PGD is 8k.  The two halves are selected by
 * bit PAGE_SHIFT of CR3 (bit 12), so flipping that bit switches between them.
 * The user PCID is marked by X86_CR3_PTI_PCID_USER_BIT.
 */

/* Bit positions. */
#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT
#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT

/* Masks derived from the bit positions above. */
#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT)
#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT)
#define PTI_USER_PGTABLE_AND_PCID_MASK	(PTI_USER_PGTABLE_MASK | PTI_USER_PCID_MASK)
|  | 208 |  | 
/*
 * Set the X86_CR3_PCID_NOFLUSH_BIT in a 64-bit register holding a CR3 value.
 * Modifies \reg and the flags.
 */
.macro SET_NOFLUSH_BIT	reg:req
	btsq	$X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
|  | 212 |  | 
/*
 * Turn the CR3 value held in \reg into the kernel variant.
 * Only \reg and the flags are modified; CR3 itself is not written here.
 */
.macro ADJUST_KERNEL_CR3 reg:req
	/* With PCID, additionally request a no-flush CR3 load. */
	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID

	/*
	 * Drop the user PCID bit and the PAGE_TABLE_ISOLATION page table bit
	 * so that the value points at the kernel page tables.
	 */
	andq	$(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
|  | 218 |  | 
/*
 * Read CR3, convert it to the kernel variant and write it back.
 * \scratch_reg is clobbered.  The whole body is jumped over unless the
 * X86_FEATURE_PTI alternative is applied.
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lkcr3_done_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg
	ADJUST_KERNEL_CR3 \scratch_reg
	movq	\scratch_reg, %cr3
.Lkcr3_done_\@:
.endm
|  | 226 |  | 
/* Memory operand for this CPU's cpu_tlbstate user_pcid_flush_mask field. */
#define THIS_CPU_user_pcid_flush_mask	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
|  | 229 |  | 
/*
 * Point CR3 at the user page tables using only registers.
 *
 * \scratch_reg is always clobbered; \scratch_reg2 is clobbered on the PCID
 * path, where it keeps a copy of CR3 while the ASID is examined.  The whole
 * body is jumped over unless the X86_FEATURE_PTI alternative is applied.
 */
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
	ALTERNATIVE "jmp .Lucr3_done_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg

	/* Without PCID there is no ASID handling: go straight to the PGD flip. */
	ALTERNATIVE "jmp .Lucr3_write_\@", "", X86_FEATURE_PCID

	/*
	 * Check whether this ASID has a flush pending: keep the full CR3
	 * value in \scratch_reg2 and reduce \scratch_reg to the ASID, which
	 * is used as the bit index into the per-CPU flush mask.
	 */
	movq	\scratch_reg, \scratch_reg2
	andq	$(0x7FF), \scratch_reg		/* mask ASID */
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lucr3_noflush_\@

	/* A flush is pending: consume the bit and load CR3 without NOFLUSH. */
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	movq	\scratch_reg2, \scratch_reg
	jmp	.Lucr3_pcid_\@

.Lucr3_noflush_\@:
	/* Nothing pending: restore the CR3 value and ask for a no-flush load. */
	movq	\scratch_reg2, \scratch_reg
	SET_NOFLUSH_BIT \scratch_reg

.Lucr3_pcid_\@:
	/* Select the user version of the ASID. */
	orq	$(PTI_USER_PCID_MASK), \scratch_reg

.Lucr3_write_\@:
	/* Select the user version of the PGD and install the result. */
	orq	$(PTI_USER_PGTABLE_MASK), \scratch_reg
	movq	\scratch_reg, %cr3
.Lucr3_done_\@:
.endm
|  | 263 |  | 
/*
 * Variant of SWITCH_TO_USER_CR3_NOSTACK for callers that can spare only one
 * scratch register: %rax serves as the second one and is preserved across
 * the switch by saving it on the stack.  \scratch_reg is clobbered.
 */
.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
	pushq	%rax				/* save %rax around the switch */
	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
	popq	%rax				/* restore %rax */
.endm
|  | 269 |  | 
/*
 * Record the current CR3 in \save_reg and, if the user page tables are
 * active, switch to the kernel ones.  \scratch_reg is clobbered.  The whole
 * body is jumped over unless the X86_FEATURE_PTI alternative is applied; in
 * that case \save_reg is not written.
 */
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lsave_cr3_done_\@", "", X86_FEATURE_PTI
	movq	%cr3, \scratch_reg
	movq	\scratch_reg, \save_reg

	/*
	 * The user page table bit tells which set of page tables is live:
	 * set means user, clear means CR3 already holds the kernel page
	 * tables and nothing needs to be written.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg
	jnc	.Lsave_cr3_done_\@

	ADJUST_KERNEL_CR3 \scratch_reg
	movq	\scratch_reg, %cr3

.Lsave_cr3_done_\@:
.endm
|  | 287 |  | 
/*
 * Write the CR3 value held in \save_reg back into CR3.
 *
 * \scratch_reg is clobbered on the PCID path, and \save_reg itself may get
 * the NOFLUSH bit set before it is written.  The whole body is jumped over
 * unless the X86_FEATURE_PTI alternative is applied.
 */
.macro RESTORE_CR3 scratch_reg:req save_reg:req
	ALTERNATIVE "jmp .Lrestore_done_\@", "", X86_FEATURE_PTI

	/* Without PCID there is no flush bookkeeping: just write CR3. */
	ALTERNATIVE "jmp .Lrestore_write_\@", "", X86_FEATURE_PCID

	/*
	 * A saved value pointing at the KERNEL page tables can always be
	 * resumed with NOFLUSH, because explicit flushes are done for those.
	 */
	bt	$PTI_USER_PGTABLE_BIT, \save_reg
	jnc	.Lrestore_noflush_\@

	/*
	 * Saved value refers to user page tables: see whether the user ASID
	 * that is about to be installed has a flush pending.
	 */
	movq	\save_reg, \scratch_reg
	andq	$(0x7FF), \scratch_reg
	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jnc	.Lrestore_noflush_\@

	/* Flush pending: consume the bit and write CR3 without NOFLUSH. */
	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
	jmp	.Lrestore_write_\@

.Lrestore_noflush_\@:
	SET_NOFLUSH_BIT \save_reg

.Lrestore_write_\@:
	/*
	 * Skipping the write when CR3 would not change is possible, but it
	 * would cost a CR3 read *and* a scratch register.
	 */
	movq	\save_reg, %cr3
.Lrestore_done_\@:
.endm
|  | 323 |  | 
|  | 324 | #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ | 
|  | 325 |  | 
/*
 * Without page table isolation the CR3 switching macros expand to nothing.
 * They keep the same names and required arguments as the real versions so
 * that callers assemble unchanged.
 */

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm

.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm

.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm

.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm
|  | 336 |  | 
|  | 337 | #endif | 
|  | 338 |  | 
|  | 339 | /* | 
|  | 340 | * Mitigate Spectre v1 for conditional swapgs code paths. | 
|  | 341 | * | 
|  | 342 | * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to | 
|  | 343 | * prevent a speculative swapgs when coming from kernel space. | 
|  | 344 | * | 
|  | 345 | * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path, | 
|  | 346 | * to prevent the swapgs from getting speculatively skipped when coming from | 
|  | 347 | * user space. | 
|  | 348 | */ | 
/* Emits nothing by default; an lfence when X86_FEATURE_FENCE_SWAPGS_USER is set. */
.macro FENCE_SWAPGS_USER_ENTRY
	ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
.endm
/* Emits nothing by default; an lfence when X86_FEATURE_FENCE_SWAPGS_KERNEL is set. */
.macro FENCE_SWAPGS_KERNEL_ENTRY
	ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
.endm
|  | 355 |  | 
|  | 356 | #endif /* CONFIG_X86_64 */ | 
|  | 357 |  | 
/*
 * Emit 'call enter_from_user_mode', unless it can be avoided:
 *  - with CONFIG_CONTEXT_TRACKING disabled the macro expands to nothing;
 *  - with jump labels available the call is guarded by the
 *    context_tracking_enabled static key and can be jumped over.
 */
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef HAVE_JUMP_LABEL
	STATIC_JUMP_IF_FALSE .Lskip_enter_call_\@, context_tracking_enabled, def=0
#endif
	call	enter_from_user_mode
.Lskip_enter_call_\@:
#endif
.endm