Blame - marvell/linux/arch/x86/mm/fault.c - T108

blob: 21383ef7b506666cf2e2b0ca8a7aec391589d312 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 1995 Linus Torvalds
				4	* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
				5	* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
				6	*/
				7	#include <linux/sched.h> /* test_thread_flag(), ... */
				8	#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
				9	#include <linux/kdebug.h> /* oops_begin/end, ... */
				10	#include <linux/extable.h> /* search_exception_tables */
				11	#include <linux/memblock.h> /* max_low_pfn */
				12	#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
				13	#include <linux/mmiotrace.h> /* kmmio_handler, ... */
				14	#include <linux/perf_event.h> /* perf_sw_event */
				15	#include <linux/hugetlb.h> /* hstate_index_to_shift */
				16	#include <linux/prefetch.h> /* prefetchw */
				17	#include <linux/context_tracking.h> /* exception_enter(), ... */
				18	#include <linux/uaccess.h> /* faulthandler_disabled() */
				19	#include <linux/efi.h> /* efi_recover_from_page_fault()*/
				20	#include <linux/mm_types.h>
				21
				22	#include <asm/cpufeature.h> /* boot_cpu_has, ... */
				23	#include <asm/traps.h> /* dotraplinkage, ... */
				24	#include <asm/pgalloc.h> /* pgd_*(), ... */
				25	#include <asm/fixmap.h> /* VSYSCALL_ADDR */
				26	#include <asm/vsyscall.h> /* emulate_vsyscall */
				27	#include <asm/vm86.h> /* struct vm86 */
				28	#include <asm/mmu_context.h> /* vma_pkey() */
				29	#include <asm/efi.h> /* efi_recover_from_page_fault()*/
				30	#include <asm/desc.h> /* store_idt(), ... */
				31	#include <asm/cpu_entry_area.h> /* exception stack */
				32
				33	#define CREATE_TRACE_POINTS
				34	#include <asm/trace/exceptions.h>
				35
				36	/*
				37	* Returns 0 if mmiotrace is disabled, or if the fault is not
				38	* handled by mmiotrace:
					*
					* Returns -1 only when is_kmmio_active() is true and
					* kmmio_handler(regs, addr) returns 1.
				39	*/
				40	static nokprobe_inline int
				41	kmmio_fault(struct pt_regs *regs, unsigned long addr)
				42	{
				43	if (unlikely(is_kmmio_active()))
				44	if (kmmio_handler(regs, addr) == 1)
				45	return -1;
				46	return 0;
				47	}
				48
				49	/*
				50	* Prefetch quirks:
				51	*
				52	* 32-bit mode:
				53	*
				54	* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
				55	* Check that here and ignore it.
				56	*
				57	* 64-bit mode:
				58	*
				59	* Sometimes the CPU reports invalid exceptions on prefetch.
				60	* Check that here and ignore it.
				61	*
				62	* Opcode checker based on code by Richard Brunner.
				63	*/
				64	static inline int
				65	check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
				66	unsigned char opcode, int *prefetch)
				67	{
					/*
					* Classify one instruction byte ("opcode") while scanning for a
					* prefetch. Returns non-zero when the byte is accepted as an
					* instruction prefix, 0 otherwise. *prefetch is written only in
					* the "case 0x00" branch, after the next byte is read from instr.
					*/
				68	unsigned char instr_hi = opcode & 0xf0;
				69	unsigned char instr_lo = opcode & 0x0f;
				70
				71	switch (instr_hi) {
				72	case 0x20:
				73	case 0x30:
				74	/*
				75	* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
				76	* In X86_64 long mode, the CPU will signal invalid
				77	* opcode if some of these prefixes are present so
				78	* X86_64 will never get here anyway
				79	*/
				80	return ((instr_lo & 7) == 0x6);
				81	#ifdef CONFIG_X86_64
				82	case 0x40:
				83	/*
				84	* In AMD64 long mode 0x40..0x4F are valid REX prefixes
				85	* Need to figure out under what instruction mode the
				86	* instruction was issued. Could check the LDT for lm,
				87	* but for now it's good enough to assume that long
				88	* mode only uses well known segments or kernel.
				89	*/
				90	return (!user_mode(regs) || user_64bit_mode(regs));
				91	#endif
				92	case 0x60:
				93	/* 0x64 thru 0x67 are valid prefixes in all modes. */
				94	return (instr_lo & 0xC) == 0x4;
				95	case 0xF0:
				96	/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
				97	return !instr_lo || (instr_lo>>1) == 1;
				98	case 0x00:
				99	/* Prefetch instruction is 0x0F0D or 0x0F18 */
				100	if (probe_kernel_address(instr, opcode))
				101	return 0;
				102
				103	*prefetch = (instr_lo == 0xF) &&
				104	(opcode == 0x0D || opcode == 0x18);
				105	return 0;
				106	default:
				107	return 0;
				108	}
				109	}
				110
				111	static int
				112	is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
				113	{
					/*
					* Decide whether the faulting instruction is a prefetch (see the
					* "Prefetch quirks" note above). Returns non-zero if
					* check_prefetch_opcode() flagged one, 0 otherwise.
					* The addr argument is not referenced in this function.
					*/
				114	unsigned char *max_instr;
				115	unsigned char *instr;
				116	int prefetch = 0;
				117
				118	/*
				119	* If it was an exec (instruction fetch) fault on NX page, then
				120	* do not ignore the fault:
				121	*/
				122	if (error_code & X86_PF_INSTR)
				123	return 0;
				124
				125	instr = (void *)convert_ip_to_linear(current, regs);
					/* Examine no more than 15 bytes from the instruction start. */
				126	max_instr = instr + 15;
				127
					/* Do not probe a user-mode IP at or above TASK_SIZE_MAX. */
				128	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
				129	return 0;
				130
				131	while (instr < max_instr) {
				132	unsigned char opcode;
				133
					/* Stop scanning as soon as a byte cannot be read. */
				134	if (probe_kernel_address(instr, opcode))
				135	break;
				136
				137	instr++;
				138
					/* A zero return means "not a prefix": nothing more to scan. */
				139	if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
				140	break;
				141	}
				142	return prefetch;
				143	}
				144
				145	DEFINE_SPINLOCK(pgd_lock); /* held by vmalloc_sync() while it walks pgd_list */
				146	LIST_HEAD(pgd_list); /* walked as struct page entries linked via ->lru */
				147
				148	#ifdef CONFIG_X86_32
				149	static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
				150	{
					/*
					* Bring the PMD entry for "address" in the page table rooted at
					* "pgd" in line with the reference table in init_mm.
					*
					* Returns the reference (init_mm) PMD pointer, or NULL when the
					* reference table has no present entry at the pgd, p4d, pud or
					* pmd level.
					*/
				151	unsigned index = pgd_index(address);
				152	pgd_t *pgd_k;
				153	p4d_t *p4d, *p4d_k;
				154	pud_t *pud, *pud_k;
				155	pmd_t *pmd, *pmd_k;
				156
				157	pgd += index;
				158	pgd_k = init_mm.pgd + index;
				159
				160	if (!pgd_present(*pgd_k))
				161	return NULL;
				162
				163	/*
				164	* set_pgd(pgd, *pgd_k); here would be useless on PAE
				165	* and redundant with the set_pmd() on non-PAE. As would
				166	* set_p4d/set_pud.
				167	*/
				168	p4d = p4d_offset(pgd, address);
				169	p4d_k = p4d_offset(pgd_k, address);
				170	if (!p4d_present(*p4d_k))
				171	return NULL;
				172
				173	pud = pud_offset(p4d, address);
				174	pud_k = pud_offset(p4d_k, address);
				175	if (!pud_present(*pud_k))
				176	return NULL;
				177
				178	pmd = pmd_offset(pud, address);
				179	pmd_k = pmd_offset(pud_k, address);
				180
					/* Copy the reference entry only when presence differs. */
				181	if (pmd_present(*pmd) != pmd_present(*pmd_k))
				182	set_pmd(pmd, *pmd_k);
				183
				184	if (!pmd_present(*pmd_k))
				185	return NULL;
				186	else
				187	BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
				188
				189	return pmd_k;
				190	}
				191
				192	static void vmalloc_sync(void)
				193	{
					/*
					* For each PMD_SIZE step of the vmalloc range, run
					* vmalloc_sync_one() on every page found on pgd_list.
					*/
				194	unsigned long address;
				195
					/* Nothing to do when SHARED_KERNEL_PMD is set. */
				196	if (SHARED_KERNEL_PMD)
				197	return;
				198
				199	for (address = VMALLOC_START & PMD_MASK;
				200	address >= TASK_SIZE_MAX && address < VMALLOC_END;
				201	address += PMD_SIZE) {
				202	struct page *page;
				203
					/* pgd_lock is taken and dropped once per address step. */
				204	spin_lock(&pgd_lock);
				205	list_for_each_entry(page, &pgd_list, lru) {
				206	spinlock_t *pgt_lock;
				207
				208	/* the pgt_lock only for Xen */
				209	pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
				210
				211	spin_lock(pgt_lock);
				212	vmalloc_sync_one(page_address(page), address);
				213	spin_unlock(pgt_lock);
				214	}
				215	spin_unlock(&pgd_lock);
				216	}
				217	}
				218
				219	void vmalloc_sync_mappings(void)
				220	{
					/* 32-bit: new mappings are propagated by the common vmalloc_sync(). */
				221	vmalloc_sync();
				222	}
				223
				224	void vmalloc_sync_unmappings(void)
				225	{
					/* 32-bit: unmappings use the same vmalloc_sync() pass as mappings. */
				226	vmalloc_sync();
				227	}
				228
				229	/*
				230	* 32-bit:
				231	*
				232	* Handle a fault on the vmalloc or module mapping area
					*
					* Returns 0 when the reference page table maps the address (large
					* PMD or present PTE), and -1 when the address is outside
					* [VMALLOC_START, VMALLOC_END) or no such mapping exists.
				233	*/
				234	static noinline int vmalloc_fault(unsigned long address)
				235	{
				236	unsigned long pgd_paddr;
				237	pmd_t *pmd_k;
				238	pte_t *pte_k;
				239
				240	/* Make sure we are in vmalloc area: */
				241	if (!(address >= VMALLOC_START && address < VMALLOC_END))
				242	return -1;
				243
				244	/*
				245	* Synchronize this task's top level page-table
				246	* with the 'reference' page table.
				247	*
				248	* Do _not_ use "current" here. We might be inside
				249	* an interrupt in the middle of a task switch..
				250	*/
				251	pgd_paddr = read_cr3_pa();
				252	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
				253	if (!pmd_k)
				254	return -1;
				255
					/* A large PMD has no PTE level to check. */
				256	if (pmd_large(*pmd_k))
				257	return 0;
				258
				259	pte_k = pte_offset_kernel(pmd_k, address);
				260	if (!pte_present(*pte_k))
				261	return -1;
				262
				263	return 0;
				264	}
				265	NOKPROBE_SYMBOL(vmalloc_fault);
				266
				267	/*
				268	* Did it hit the DOS screen memory VA from vm86 mode?
					*
					* If so, set the bit for the touched page (page index relative to
					* 0xA0000, first 32 pages only) in tsk->thread.vm86->screen_bitmap.
					* Compiles to an empty function without CONFIG_VM86.
				269	*/
				270	static inline void
				271	check_v8086_mode(struct pt_regs *regs, unsigned long address,
				272	struct task_struct *tsk)
				273	{
				274	#ifdef CONFIG_VM86
				275	unsigned long bit;
				276
				277	if (!v8086_mode(regs) || !tsk->thread.vm86)
				278	return;
				279
				280	bit = (address - 0xA0000) >> PAGE_SHIFT;
					/* Unsigned constant: bit may be 31, which overflows a signed int shift. */
				281	if (bit < 32)
				282	tsk->thread.vm86->screen_bitmap |= 1UL << bit;
				283	#endif
				284	}
				285
				286	static bool low_pfn(unsigned long pfn)
				287	{
					/* True when pfn lies below max_low_pfn. */
				288	return pfn < max_low_pfn;
				289	}
				290
				291	static void dump_pagetable(unsigned long address)
				292	{
					/*
					* 32-bit: print the page-table entries that translate "address",
					* starting from the table currently loaded in CR3. The walk stops
					* early when an entry is not present, is large, or points above
					* max_low_pfn.
					*/
				293	pgd_t *base = __va(read_cr3_pa());
				294	pgd_t *pgd = &base[pgd_index(address)];
				295	p4d_t *p4d;
				296	pud_t *pud;
				297	pmd_t *pmd;
				298	pte_t *pte;
				299
				300	#ifdef CONFIG_X86_PAE
				301	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
				302	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
				303	goto out;
				304	#define pr_pde pr_cont
				305	#else
				306	#define pr_pde pr_info
				307	#endif
				308	p4d = p4d_offset(pgd, address);
				309	pud = pud_offset(p4d, address);
				310	pmd = pmd_offset(pud, address);
					/* "%0*Lx": the field width is passed as an argument (two hex digits per byte). */
				311	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
				312	#undef pr_pde
				313
				314	/*
				315	* We must not directly access the pte in the highpte
				316	* case if the page table is located in highmem.
				317	* And let's rather not kmap-atomic the pte, just in case
				318	* it's allocated already:
				319	*/
				320	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
				321	goto out;
				322
				323	pte = pte_offset_kernel(pmd, address);
				324	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
				325	out:
				326	pr_cont("\n");
				327	}
				328
				329	#else /* CONFIG_X86_64: */
				330
				331	void vmalloc_sync_mappings(void)
				332	{
				333	/*
				334	* 64-bit mappings might allocate new p4d/pud pages
				335	* that need to be propagated to all tasks' PGDs.
					*
					* The range handed to sync_global_pgds() starts at VMALLOC_START
					* rounded down with PGDIR_MASK and ends at VMALLOC_END.
				336	*/
				337	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
				338	}
				339
				340	void vmalloc_sync_unmappings(void)
				341	{
				342	/*
				343	* Unmappings never allocate or free p4d/pud pages.
				344	* No work is required here.
					*
					* The body is therefore intentionally empty on 64-bit.
				345	*/
				346	}
				347
				348	/*
				349	* 64-bit:
				350	*
				351	* Handle a fault on the vmalloc area
					*
					* Returns 0 when the address is mapped in the reference page table
					* (large PUD/PMD or present PTE) and -1 when it is outside
					* [VMALLOC_START, VMALLOC_END) or some level has no entry.
				352	*/
				353	static noinline int vmalloc_fault(unsigned long address)
				354	{
				355	pgd_t *pgd, *pgd_k;
				356	p4d_t *p4d, *p4d_k;
				357	pud_t *pud;
				358	pmd_t *pmd;
				359	pte_t *pte;
				360
				361	/* Make sure we are in vmalloc area: */
				362	if (!(address >= VMALLOC_START && address < VMALLOC_END))
				363	return -1;
				364
				365	/*
				366	* Copy kernel mappings over when needed. This can also
				367	* happen within a race in page table update. In the latter
				368	* case just flush:
				369	*/
				370	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
				371	pgd_k = pgd_offset_k(address);
				372	if (pgd_none(*pgd_k))
				373	return -1;
				374
				375	if (pgtable_l5_enabled()) {
				376	if (pgd_none(*pgd)) {
				377	set_pgd(pgd, *pgd_k);
				378	arch_flush_lazy_mmu_mode();
				379	} else {
				380	BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
				381	}
				382	}
				383
				384	/* With 4-level paging, copying happens on the p4d level. */
				385	p4d = p4d_offset(pgd, address);
				386	p4d_k = p4d_offset(pgd_k, address);
				387	if (p4d_none(*p4d_k))
				388	return -1;
				389
				390	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
				391	set_p4d(p4d, *p4d_k);
				392	arch_flush_lazy_mmu_mode();
				393	} else {
				394	BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
				395	}
				396
				397	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
				398
				399	pud = pud_offset(p4d, address);
				400	if (pud_none(*pud))
				401	return -1;
				402
				403	if (pud_large(*pud))
				404	return 0;
				405
				406	pmd = pmd_offset(pud, address);
				407	if (pmd_none(*pmd))
				408	return -1;
				409
				410	if (pmd_large(*pmd))
				411	return 0;
				412
				413	pte = pte_offset_kernel(pmd, address);
				414	if (!pte_present(*pte))
				415	return -1;
				416
				417	return 0;
				418	}
				419	NOKPROBE_SYMBOL(vmalloc_fault);
				420
				421	#ifdef CONFIG_CPU_SUP_AMD
					/* Message passed to printk_once() by is_errata93(); KERN_ERR is its log-level prefix. */
				422	static const char errata93_warning[] =
				423	KERN_ERR
				424	"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
				425	"******* Working around it, but it may cause SEGVs or burn power.\n"
				426	"******* Please consider a BIOS update.\n"
				427	"******* Disabling USB legacy in the BIOS may also help.\n";
				428	#endif
				429
				430	/*
				431	* No vm86 mode in 64-bit mode:
					*
					* Empty stub with the same signature as the CONFIG_X86_32 version
					* defined earlier in this file.
				432	*/
				433	static inline void
				434	check_v8086_mode(struct pt_regs *regs, unsigned long address,
				435	struct task_struct *tsk)
				436	{
				437	}
				438
				439	static int bad_address(void *p)
				440	{
					/*
					* Try to read one unsigned long at p with probe_kernel_address()
					* and return its result; dump_pagetable() treats non-zero as
					* "this pointer cannot be read". The value read is discarded.
					*/
				441	unsigned long dummy;
				442
				443	return probe_kernel_address((unsigned long *)p, dummy);
				444	}
				445
				446	static void dump_pagetable(unsigned long address)
				447	{
					/*
					* 64-bit: print the PGD/P4D/PUD/PMD/PTE entries that translate
					* "address", starting from the table currently loaded in CR3.
					* Each table pointer is checked with bad_address() before it is
					* dereferenced; an unreadable pointer ends the dump with "BAD".
					* The walk also stops at the first non-present or large entry.
					*/
				448	pgd_t *base = __va(read_cr3_pa());
				449	pgd_t *pgd = base + pgd_index(address);
				450	p4d_t *p4d;
				451	pud_t *pud;
				452	pmd_t *pmd;
				453	pte_t *pte;
				454
				455	if (bad_address(pgd))
				456	goto bad;
				457
				458	pr_info("PGD %lx ", pgd_val(*pgd));
				459
				460	if (!pgd_present(*pgd))
				461	goto out;
				462
				463	p4d = p4d_offset(pgd, address);
				464	if (bad_address(p4d))
				465	goto bad;
				466
				467	pr_cont("P4D %lx ", p4d_val(*p4d));
				468	if (!p4d_present(*p4d) || p4d_large(*p4d))
				469	goto out;
				470
				471	pud = pud_offset(p4d, address);
				472	if (bad_address(pud))
				473	goto bad;
				474
				475	pr_cont("PUD %lx ", pud_val(*pud));
				476	if (!pud_present(*pud) || pud_large(*pud))
				477	goto out;
				478
				479	pmd = pmd_offset(pud, address);
				480	if (bad_address(pmd))
				481	goto bad;
				482
				483	pr_cont("PMD %lx ", pmd_val(*pmd));
				484	if (!pmd_present(*pmd) || pmd_large(*pmd))
				485	goto out;
				486
				487	pte = pte_offset_kernel(pmd, address);
				488	if (bad_address(pte))
				489	goto bad;
				490
				491	pr_cont("PTE %lx", pte_val(*pte));
				492	out:
				493	pr_cont("\n");
				494	return;
				495	bad:
				496	pr_info("BAD\n");
				497	}
				498
				499	#endif /* CONFIG_X86_64 */
				500
				501	/*
				502	* Workaround for K8 erratum #93 & buggy BIOS.
				503	*
				504	* BIOS SMM functions are required to use a specific workaround
				505	* to avoid corruption of the 64bit RIP register on C stepping K8.
				506	*
				507	* A lot of BIOS that didn't get tested properly miss this.
				508	*
				509	* The OS sees this as a page fault with the upper 32bits of RIP cleared.
				510	* Try to work around it here.
				511	*
				512	* Note we only handle faults in kernel here.
				513	* Does nothing on 32-bit.
					*
					* Returns 1 after rewriting regs->ip when the repaired address falls
					* in kernel text or the module area, and 0 in every other case.
				514	*/
				515	static int is_errata93(struct pt_regs *regs, unsigned long address)
				516	{
				517	#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
				518	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
				519	|| boot_cpu_data.x86 != 0xf)
				520	return 0;
				521
				522	if (address != regs->ip)
				523	return 0;
				524
				525	if ((address >> 32) != 0)
				526	return 0;
				527
					/* Put back the upper 32 bits that the erratum cleared. */
				528	address |= 0xffffffffUL << 32;
				529	if ((address >= (u64)_stext && address <= (u64)_etext) ||
				530	(address >= MODULES_VADDR && address <= MODULES_END)) {
				531	printk_once(errata93_warning);
				532	regs->ip = address;
				533	return 1;
				534	}
				535	#endif
				536	return 0;
				537	}
				538
				539	/*
				540	* Work around K8 erratum #100 K8 in compat mode occasionally jumps
				541	* to illegal addresses >4GB.
				542	*
				543	* We catch this in the page fault handler because these addresses
				544	* are not reachable. Just detect this case and return. Any code
				545	* segment in LDT is compatibility mode.
					*
					* Returns 1 when CS is __USER32_CS or has bit 2 set and the address
					* has bits above 32 set; otherwise 0. Always 0 without CONFIG_X86_64.
				546	*/
				547	static int is_errata100(struct pt_regs *regs, unsigned long address)
				548	{
				549	#ifdef CONFIG_X86_64
				550	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
				551	return 1;
				552	#endif
				553	return 0;
				554	}
				555
				556	static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
				557	{
					/*
					* Returns 1 after calling do_invalid_op(regs, 0) when the CPU has
					* X86_BUG_F00F and (address - idt_descr.address) >> 3 equals 6.
					* Returns 0 otherwise, and always 0 without CONFIG_X86_F00F_BUG.
					*/
				558	#ifdef CONFIG_X86_F00F_BUG
				559	unsigned long nr;
				560
				561	/*
				562	* Pentium F0 0F C7 C8 bug workaround:
				563	*/
				564	if (boot_cpu_has_bug(X86_BUG_F00F)) {
				565	nr = (address - idt_descr.address) >> 3;
				566
				567	if (nr == 6) {
				568	do_invalid_op(regs, 0);
				569	return 1;
				570	}
				571	}
				572	#endif
				573	return 0;
				574	}
				575
				576	static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
				577	{
					/*
					* Print base and limit of the LDT/TSS descriptor that selector
					* "index" refers to in the table described by "gdt", labelled
					* with "name". A zero selector, an out-of-bounds offset or an
					* unreadable entry is reported instead.
					*/
				578	u32 offset = (index >> 3) * sizeof(struct desc_struct);
				579	unsigned long addr;
				580	struct ldttss_desc desc;
				581
				582	if (index == 0) {
				583	pr_alert("%s: NULL\n", name);
				584	return;
				585	}
				586
				587	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
				588	pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
				589	return;
				590	}
				591
				592	if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
				593	sizeof(struct ldttss_desc))) {
				594	pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
				595	name, index);
				596	return;
				597	}
				598
					/* Reassemble the base address from its split descriptor fields. */
				599	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
				600	#ifdef CONFIG_X86_64
				601	addr |= ((u64)desc.base3 << 32);
				602	#endif
				603	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
				604	name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
				605	}
				606
				607	static void
				608	show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
				609	{
					/*
					* Print the diagnostic lines for a fatal page fault: NX/SMEP
					* hints for instruction fetches, the faulting address, a decoded
					* error code, descriptor-table details for implicit supervisor
					* accesses from user mode, and finally the page-table dump.
					* Prints nothing when oops_may_print() is false.
					*/
				610	if (!oops_may_print())
				611	return;
				612
				613	if (error_code & X86_PF_INSTR) {
				614	unsigned int level;
				615	pgd_t *pgd;
				616	pte_t *pte;
				617
				618	pgd = __va(read_cr3_pa());
				619	pgd += pgd_index(address);
				620
				621	pte = lookup_address_in_pgd(pgd, address, &level);
				622
				623	if (pte && pte_present(*pte) && !pte_exec(*pte))
				624	pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				625	from_kuid(&init_user_ns, current_uid()));
				626	if (pte && pte_present(*pte) && pte_exec(*pte) &&
				627	(pgd_flags(*pgd) & _PAGE_USER) &&
				628	(__read_cr4() & X86_CR4_SMEP))
				629	pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				630	from_kuid(&init_user_ns, current_uid()));
				631	}
				632
				633	if (address < PAGE_SIZE && !user_mode(regs))
				634	pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
				635	(void *)address);
				636	else
				637	pr_alert("BUG: unable to handle page fault for address: %px\n",
				638	(void *)address);
				639
				640	pr_alert("#PF: %s %s in %s mode\n",
				641	(error_code & X86_PF_USER) ? "user" : "supervisor",
				642	(error_code & X86_PF_INSTR) ? "instruction fetch" :
				643	(error_code & X86_PF_WRITE) ? "write access" :
				644	"read access",
				645	user_mode(regs) ? "user" : "kernel");
				646	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
				647	!(error_code & X86_PF_PROT) ? "not-present page" :
				648	(error_code & X86_PF_RSVD) ? "reserved bit violation" :
				649	(error_code & X86_PF_PK) ? "protection keys violation" :
				650	"permissions violation");
				651
				652	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
				653	struct desc_ptr idt, gdt;
				654	u16 ldtr, tr;
				655
				656	/*
				657	* This can happen for quite a few reasons. The more obvious
				658	* ones are faults accessing the GDT, or LDT. Perhaps
				659	* surprisingly, if the CPU tries to deliver a benign or
				660	* contributory exception from user code and gets a page fault
				661	* during delivery, the page fault can be delivered as though
				662	* it originated directly from user code. This could happen
				663	* due to wrong permissions on the IDT, GDT, LDT, TSS, or
				664	* kernel or IST stack.
				665	*/
				666	store_idt(&idt);
				667
				668	/* Usable even on Xen PV -- it's just slow. */
				669	native_store_gdt(&gdt);
				670
				671	pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
				672	idt.address, idt.size, gdt.address, gdt.size);
				673
				674	store_ldt(ldtr);
				675	show_ldttss(&gdt, "LDTR", ldtr);
				676
				677	store_tr(tr);
				678	show_ldttss(&gdt, "TR", tr);
				679	}
				680
				681	dump_pagetable(address);
				682	}
				683
				684	static noinline void
				685	pgtable_bad(struct pt_regs *regs, unsigned long error_code,
				686	unsigned long address)
				687	{
					/*
					* Report a corrupted page table: print the task name and the
					* address, dump the page table, then call __die("Bad pagetable").
					* oops_end() is given SIGKILL, or 0 when __die() returns non-zero.
					*/
				688	struct task_struct *tsk;
				689	unsigned long flags;
				690	int sig;
				691
				692	flags = oops_begin();
				693	tsk = current;
				694	sig = SIGKILL;
				695
				696	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
				697	tsk->comm, address);
				698	dump_pagetable(address);
				699
				700	if (__die("Bad pagetable", regs, error_code))
				701	sig = 0;
				702
				703	oops_end(flags, regs, sig);
				704	}
				705
				706	static void set_signal_archinfo(unsigned long address,
				707	unsigned long error_code)
				708	{
					/*
					* Record the fault details in current->thread: trap_nr, the
					* error code (always with X86_PF_USER set) and the address as cr2.
					*/
				709	struct task_struct *tsk = current;
				710
				711	/*
				712	* To avoid leaking information about the kernel page
				713	* table layout, pretend that user-mode accesses to
				714	* kernel addresses are always protection faults.
				715	*
				716	* NB: This means that failed vsyscalls with vsyscall=none
				717	* will have the PROT bit. This doesn't leak any
				718	* information and does not appear to cause any problems.
				719	*/
				720	if (address >= TASK_SIZE_MAX)
				721	error_code |= X86_PF_PROT;
				722
				723	tsk->thread.trap_nr = X86_TRAP_PF;
				724	tsk->thread.error_code = error_code | X86_PF_USER;
				725	tsk->thread.cr2 = address;
				726	}
				727
				728	static noinline void
				729	no_context(struct pt_regs *regs, unsigned long error_code,
				730	unsigned long address, int signal, int si_code)
				731	{
					/*
					* Handle a fault that cannot be resolved by delivering a signal.
					* Kernel-mode faults first get a chance to recover (exception
					* fixup, stack-overflow handling, prefetch and erratum #93
					* checks, EFI recovery); anything left over ends in an oops.
					* The signal and si_code arguments are not referenced here.
					*/
				732	struct task_struct *tsk = current;
				733	unsigned long flags;
				734	int sig;
				735
				736	if (user_mode(regs)) {
				737	/*
				738	* This is an implicit supervisor-mode access from user
				739	* mode. Bypass all the kernel-mode recovery code and just
				740	* OOPS.
				741	*/
				742	goto oops;
				743	}
				744
				745	/* Are we prepared to handle this kernel fault? */
				746	if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
				747	return;
				748
				749	#ifdef CONFIG_VMAP_STACK
				750	/*
				751	* Stack overflow? During boot, we can fault near the initial
				752	* stack in the direct map, but that's not an overflow -- check
				753	* that we're in vmalloc space to avoid this.
				754	*/
				755	if (is_vmalloc_addr((void *)address) &&
				756	(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
				757	address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
				758	unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
				759	/*
				760	* We're likely to be running with very little stack space
				761	* left. It's plausible that we'd hit this condition but
				762	* double-fault even before we get this far, in which case
				763	* we're fine: the double-fault handler will deal with it.
				764	*
				765	* We don't want to make it all the way into the oops code
				766	* and then double-fault, though, because we're likely to
				767	* break the console driver and lose most of the stack dump.
				768	*/
				769	asm volatile ("movq %[stack], %%rsp\n\t"
				770	"call handle_stack_overflow\n\t"
				771	"1: jmp 1b"
				772	: ASM_CALL_CONSTRAINT
				773	: "D" ("kernel stack overflow (page fault)"),
				774	"S" (regs), "d" (address),
				775	[stack] "rm" (stack));
				776	unreachable();
				777	}
				778	#endif
				779
				780	/*
				781	* 32-bit:
				782	*
				783	* Valid to do another page fault here, because if this fault
				784	* had been triggered by is_prefetch fixup_exception would have
				785	* handled it.
				786	*
				787	* 64-bit:
				788	*
				789	* Hall of shame of CPU/BIOS bugs.
				790	*/
				791	if (is_prefetch(regs, error_code, address))
				792	return;
				793
				794	if (is_errata93(regs, address))
				795	return;
				796
				797	/*
				798	* Buggy firmware could access regions which might page fault, try to
				799	* recover from such faults.
				800	*/
				801	if (IS_ENABLED(CONFIG_EFI))
				802	efi_recover_from_page_fault(address);
				803
				804	oops:
				805	/*
				806	* Oops. The kernel tried to access some bad page. We'll have to
				807	* terminate things with extreme prejudice:
				808	*/
				809	flags = oops_begin();
				810
				811	show_fault_oops(regs, error_code, address);
				812
				813	if (task_stack_end_corrupted(tsk))
				814	printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
				815
				816	sig = SIGKILL;
				817	if (__die("Oops", regs, error_code))
				818	sig = 0;
				819
				820	/* Executive summary in case the body of the oops scrolled away */
				821	printk(KERN_DEFAULT "CR2: %016lx\n", address);
				822
				823	oops_end(flags, regs, sig);
				824	}
				825
				826	/*
				827	* Print out info about fatal segfaults, if the show_unhandled_signals
				828	* sysctl is set:
					*
					* Nothing is printed when unhandled_signal(tsk, SIGSEGV) is false or
					* printk_ratelimit() says to stay quiet. The log level is KERN_EMERG
					* for a task whose pid number is 1 or lower, KERN_INFO otherwise.
				829	*/
				830	static inline void
				831	show_signal_msg(struct pt_regs *regs, unsigned long error_code,
				832	unsigned long address, struct task_struct *tsk)
				833	{
				834	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
				835
				836	if (!unhandled_signal(tsk, SIGSEGV))
				837	return;
				838
				839	if (!printk_ratelimit())
				840	return;
				841
				842	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
				843	loglvl, tsk->comm, task_pid_nr(tsk), address,
				844	(void *)regs->ip, (void *)regs->sp, error_code);
				845
				846	print_vma_addr(KERN_CONT " in ", regs->ip);
				847
				848	printk(KERN_CONT "\n");
				849
				850	show_opcodes(regs, loglvl);
				851	}
				852
				853	/*
				854	* The (legacy) vsyscall page is the lone page in the kernel portion
				855	* of the address space that has user-accessible permissions.
					*
					* Returns true when vaddr, masked with PAGE_MASK, equals VSYSCALL_ADDR.
				856	*/
				857	static bool is_vsyscall_vaddr(unsigned long vaddr)
				858	{
				859	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
				860	}
				861
				862	static void
				863	__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
				864	unsigned long address, u32 pkey, int si_code)
				865	{
					/*
					* Deliver the result of a bad-address fault. A user-mode access
					* gets a SIGSEGV with the given si_code (and pkey for
					* SEGV_PKUERR) unless a prefetch or erratum #100 quirk explains
					* it. Everything else goes to no_context() after the F00F check.
					*/
				866	struct task_struct *tsk = current;
				867
				868	/* User mode accesses just cause a SIGSEGV */
				869	if (user_mode(regs) && (error_code & X86_PF_USER)) {
				870	/*
				871	* It's possible to have interrupts off here:
				872	*/
				873	local_irq_enable();
				874
				875	/*
				876	* Valid to do another page fault here because this one came
				877	* from user space:
				878	*/
				879	if (is_prefetch(regs, error_code, address))
				880	return;
				881
				882	if (is_errata100(regs, address))
				883	return;
				884
				885	/*
				886	* To avoid leaking information about the kernel page table
				887	* layout, pretend that user-mode accesses to kernel addresses
				888	* are always protection faults.
				889	*/
				890	if (address >= TASK_SIZE_MAX)
				891	error_code |= X86_PF_PROT;
				892
				893	if (likely(show_unhandled_signals))
				894	show_signal_msg(regs, error_code, address, tsk);
				895
				896	set_signal_archinfo(address, error_code);
				897
					/* Send exactly one signal: the pkey variant or the generic one. */
				898	if (si_code == SEGV_PKUERR)
				899	force_sig_pkuerr((void __user *)address, pkey);
				900	else
				901	force_sig_fault(SIGSEGV, si_code, (void __user *)address);
				902
				903	return;
				904	}
				905
				906	if (is_f00f_bug(regs, address))
				907	return;
				908
				909	no_context(regs, error_code, address, SIGSEGV, si_code);
				910	}
				911
				912	static noinline void
				913	bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
				914	unsigned long address)
				915	{
					/* Plain unmapped-address case: pkey 0 and si_code SEGV_MAPERR. */
				916	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
				917	}
				918
				919	static void
				920	__bad_area(struct pt_regs *regs, unsigned long error_code,
				921	unsigned long address, u32 pkey, int si_code)
				922	{
					/*
					* Release the read side of current->mm->mmap_sem, then hand over
					* to __bad_area_nosemaphore() with the same arguments.
					* NOTE(review): presumably every caller holds mmap_sem for read
					* on entry -- confirm against the callers.
					*/
				923	struct mm_struct *mm = current->mm;
				924	/*
				925	* Something tried to access memory that isn't in our memory map..
				926	* Fix it, but check if it's kernel or user first..
				927	*/
				928	up_read(&mm->mmap_sem);
				929
				930	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
				931	}
				932
				933	static noinline void
				934	bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
				935	{
					/* Same as __bad_area() with pkey 0 and si_code SEGV_MAPERR. */
				936	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
				937	}
				938
				939	static inline bool bad_area_access_from_pkeys(unsigned long error_code,
				940	struct vm_area_struct *vma)
				941	{
					/*
					* Returns false when the CPU lacks X86_FEATURE_OSPKE. Otherwise
					* returns true if the error code has X86_PF_PK set or if
					* arch_vma_access_permitted() rejects the access for this VMA.
					*/
				942	/* This code is always called on the current mm */
				943	bool foreign = false;
				944
				945	if (!boot_cpu_has(X86_FEATURE_OSPKE))
				946	return false;
				947	if (error_code & X86_PF_PK)
				948	return true;
				949	/* this checks permission keys on the VMA: */
				950	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				951	(error_code & X86_PF_INSTR), foreign))
				952	return true;
				953	return false;
				954	}
				955
				956	static noinline void
				957	bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
				958	unsigned long address, struct vm_area_struct *vma)
				959	{
					/*
					* Report an access-permission fault on an existing VMA: si_code is
					* SEGV_PKUERR (with the VMA's pkey) when protection keys are
					* involved, SEGV_ACCERR otherwise. Both paths go through
					* __bad_area().
					*/
				960	/*
				961	* This OSPKE check is not strictly necessary at runtime.
				962	* But, doing it this way allows compiler optimizations
				963	* if pkeys are compiled out.
				964	*/
				965	if (bad_area_access_from_pkeys(error_code, vma)) {
				966	/*
				967	* A protection key fault means that the PKRU value did not allow
				968	* access to some PTE. Userspace can figure out what PKRU was
				969	* from the XSAVE state. This function captures the pkey from
				970	* the vma and passes it to userspace so userspace can discover
				971	* which protection key was set on the PTE.
				972	*
				973	* If we get here, we know that the hardware signaled a X86_PF_PK
				974	* fault and that there was a VMA once we got in the fault
				975	* handler. It does not guarantee that the VMA we find here
				976	* was the one that we faulted on.
				977	*
				978	* 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
				979	* 2. T1 : set PKRU to deny access to pkey=4, touches page
				980	* 3. T1 : faults...
				981	* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
				982	* 5. T1 : enters fault handler, takes mmap_sem, etc...
				983	* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
				984	* faulted on a pte with its pkey=4.
				985	*/
				986	u32 pkey = vma_pkey(vma);
				987
				988	__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
				989	} else {
				990	__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
				991	}
				992	}
				993
				994	static void
				995	do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
				996	vm_fault_t fault)
				997	{
					/*
					* Turn a SIGBUS-class fault result into a signal. Faults without
					* X86_PF_USER go to no_context(). User faults get SIGBUS with
					* BUS_ADRERR, or a machine-check style signal for hwpoison
					* results when CONFIG_MEMORY_FAILURE is enabled.
					*/
				998	/* Kernel mode? Handle exceptions or die: */
				999	if (!(error_code & X86_PF_USER)) {
				1000	no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
				1001	return;
				1002	}
				1003
				1004	/* User-space => ok to do another page fault: */
				1005	if (is_prefetch(regs, error_code, address))
				1006	return;
				1007
				1008	set_signal_archinfo(address, error_code);
				1009
				1010	#ifdef CONFIG_MEMORY_FAILURE
				1011	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
				1012	struct task_struct *tsk = current;
				1013	unsigned lsb = 0;
				1014
				1015	pr_err(
				1016	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
				1017	tsk->comm, tsk->pid, address);
				1018	if (fault & VM_FAULT_HWPOISON_LARGE)
				1019	lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
				1020	if (fault & VM_FAULT_HWPOISON)
				1021	lsb = PAGE_SHIFT;
				1022	force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
				1023	return;
				1024	}
				1025	#endif
				1026	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
				1027	}
				1028
				1029	static noinline void
				1030	mm_fault_error(struct pt_regs *regs, unsigned long error_code,
				1031	unsigned long address, vm_fault_t fault)
				1032	{
				1033	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
				1034	no_context(regs, error_code, address, 0, 0);
				1035	return;
				1036	}
				1037
				1038	if (fault & VM_FAULT_OOM) {
				1039	/* Kernel mode? Handle exceptions or die: */
				1040	if (!(error_code & X86_PF_USER)) {
				1041	no_context(regs, error_code, address,
				1042	SIGSEGV, SEGV_MAPERR);
				1043	return;
				1044	}
				1045
				1046	/*
				1047	* We ran out of memory, call the OOM killer, and return the
				1048	* userspace (which will retry the fault, or kill us if we got
				1049	* oom-killed):
				1050	*/
				1051	pagefault_out_of_memory();
				1052	} else {
				1053	if (fault & (VM_FAULT_SIGBUS\|VM_FAULT_HWPOISON\|
				1054	VM_FAULT_HWPOISON_LARGE))
				1055	do_sigbus(regs, error_code, address, fault);
				1056	else if (fault & VM_FAULT_SIGSEGV)
				1057	bad_area_nosemaphore(regs, error_code, address);
				1058	else
				1059	BUG();
				1060	}
				1061	}
				1062
				1063	static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
				1064	{
				1065	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
				1066	return 0;
				1067
				1068	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
				1069	return 0;
				1070
				1071	return 1;
				1072	}
				1073
				1074	/*
				1075	* Handle a spurious fault caused by a stale TLB entry.
				1076	*
				1077	* This allows us to lazily refresh the TLB when increasing the
				1078	* permissions of a kernel page (RO -> RW or NX -> X). Doing it
				1079	* eagerly is very expensive since that implies doing a full
				1080	* cross-processor TLB flush, even if no stale TLB entries exist
				1081	* on other processors.
				1082	*
				1083	* Spurious faults may only occur if the TLB contains an entry with
				1084	* fewer permission than the page table entry. Non-present (P = 0)
				1085	* and reserved bit (R = 1) faults are never spurious.
				1086	*
				1087	* There are no security implications to leaving a stale TLB when
				1088	* increasing the permissions on a page.
				1089	*
				1090	* Returns non-zero if a spurious fault was handled, zero otherwise.
				1091	*
				1092	* See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
				1093	* (Optional Invalidation).
				1094	*/
				1095	static noinline int
				1096	spurious_kernel_fault(unsigned long error_code, unsigned long address)
				1097	{
				1098	pgd_t *pgd;
				1099	p4d_t *p4d;
				1100	pud_t *pud;
				1101	pmd_t *pmd;
				1102	pte_t *pte;
				1103	int ret;
				1104
				1105	/*
				1106	* Only writes to RO or instruction fetches from NX may cause
				1107	* spurious faults.
				1108	*
				1109	* These could be from user or supervisor accesses but the TLB
				1110	* is only lazily flushed after a kernel mapping protection
				1111	* change, so user accesses are not expected to cause spurious
				1112	* faults.
				1113	*/
				1114	if (error_code != (X86_PF_WRITE \| X86_PF_PROT) &&
				1115	error_code != (X86_PF_INSTR \| X86_PF_PROT))
				1116	return 0;
				1117
				1118	pgd = init_mm.pgd + pgd_index(address);
				1119	if (!pgd_present(*pgd))
				1120	return 0;
				1121
				1122	p4d = p4d_offset(pgd, address);
				1123	if (!p4d_present(*p4d))
				1124	return 0;
				1125
				1126	if (p4d_large(*p4d))
				1127	return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
				1128
				1129	pud = pud_offset(p4d, address);
				1130	if (!pud_present(*pud))
				1131	return 0;
				1132
				1133	if (pud_large(*pud))
				1134	return spurious_kernel_fault_check(error_code, (pte_t *) pud);
				1135
				1136	pmd = pmd_offset(pud, address);
				1137	if (!pmd_present(*pmd))
				1138	return 0;
				1139
				1140	if (pmd_large(*pmd))
				1141	return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
				1142
				1143	pte = pte_offset_kernel(pmd, address);
				1144	if (!pte_present(*pte))
				1145	return 0;
				1146
				1147	ret = spurious_kernel_fault_check(error_code, pte);
				1148	if (!ret)
				1149	return 0;
				1150
				1151	/*
				1152	* Make sure we have permissions in PMD.
				1153	* If not, then there's a bug in the page tables:
				1154	*/
				1155	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
				1156	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
				1157
				1158	return ret;
				1159	}
				1160	NOKPROBE_SYMBOL(spurious_kernel_fault);
				1161
					/*
					 * Globally visible flag, enabled (1) by default.
					 * NOTE(review): presumably gates logging of unhandled
					 * signals; it is not referenced in this part of the
					 * file - confirm against its users.
					 */
				1162	int show_unhandled_signals = 1;
				1163
				1164	static inline int
				1165	access_error(unsigned long error_code, struct vm_area_struct *vma)
				1166	{
				1167	/* This is only called for the current mm, so: */
				1168	bool foreign = false;
				1169
				1170	/*
				1171	* Read or write was blocked by protection keys. This is
				1172	* always an unconditional error and can never result in
				1173	* a follow-up action to resolve the fault, like a COW.
				1174	*/
				1175	if (error_code & X86_PF_PK)
				1176	return 1;
				1177
				1178	/*
				1179	* Make sure to check the VMA so that we do not perform
				1180	* faults just to hit a X86_PF_PK as soon as we fill in a
				1181	* page.
				1182	*/
				1183	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				1184	(error_code & X86_PF_INSTR), foreign))
				1185	return 1;
				1186
				1187	if (error_code & X86_PF_WRITE) {
				1188	/* write, present and write, not present: */
				1189	if (unlikely(!(vma->vm_flags & VM_WRITE)))
				1190	return 1;
				1191	return 0;
				1192	}
				1193
				1194	/* read, present: */
				1195	if (unlikely(error_code & X86_PF_PROT))
				1196	return 1;
				1197
				1198	/* read, not present: */
				1199	if (unlikely(!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE))))
				1200	return 1;
				1201
				1202	return 0;
				1203	}
				1204
				1205	static int fault_in_kernel_space(unsigned long address)
				1206	{
				1207	/*
				1208	* On 64-bit systems, the vsyscall page is at an address above
				1209	* TASK_SIZE_MAX, but is not considered part of the kernel
				1210	* address space.
				1211	*/
				1212	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
				1213	return false;
				1214
				1215	return address >= TASK_SIZE_MAX;
				1216	}
				1217
				1218	/*
				1219	* Called for all faults where 'address' is part of the kernel address
				1220	* space. Might get called for faults that originate from code that
				1221	* ran in userspace or the kernel.
				1222	*/
				1223	static void
				1224	do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
				1225	unsigned long address)
				1226	{
				1227	/*
				1228	* Protection keys exceptions only happen on user pages. We
				1229	* have no user pages in the kernel portion of the address
				1230	* space, so do not expect them here.
				1231	*/
				1232	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
				1233
				1234	/*
				1235	* We can fault-in kernel-space virtual memory on-demand. The
				1236	* 'reference' page table is init_mm.pgd.
				1237	*
				1238	* NOTE! We MUST NOT take any locks for this case. We may
				1239	* be in an interrupt or a critical region, and should
				1240	* only copy the information from the master page table,
				1241	* nothing more.
				1242	*
				1243	* Before doing this on-demand faulting, ensure that the
				1244	* fault is not any of the following:
				1245	* 1. A fault on a PTE with a reserved bit set.
				1246	* 2. A fault caused by a user-mode access. (Do not demand-
				1247	* fault kernel memory due to user-mode accesses).
				1248	* 3. A fault caused by a page-level protection violation.
				1249	* (A demand fault would be on a non-present page which
				1250	* would have X86_PF_PROT==0).
				1251	*/
				1252	if (!(hw_error_code & (X86_PF_RSVD \| X86_PF_USER \| X86_PF_PROT))) {
				1253	if (vmalloc_fault(address) >= 0)
				1254	return;
				1255	}
				1256
				1257	/* Was the fault spurious, caused by lazy TLB invalidation? */
				1258	if (spurious_kernel_fault(hw_error_code, address))
				1259	return;
				1260
				1261	/* kprobes don't want to hook the spurious faults: */
				1262	if (kprobe_page_fault(regs, X86_TRAP_PF))
				1263	return;
				1264
				1265	/*
				1266	* Note, despite being a "bad area", there are quite a few
				1267	* acceptable reasons to get here, such as erratum fixups
				1268	* and handling kernel code that can fault, like get_user().
				1269	*
				1270	* Don't take the mm semaphore here. If we fixup a prefetch
				1271	* fault we could otherwise deadlock:
				1272	*/
				1273	bad_area_nosemaphore(regs, hw_error_code, address);
				1274	}
				1275	NOKPROBE_SYMBOL(do_kern_addr_fault);
				1276
				1277	/* Handle faults in the user portion of the address space */
				1278	static inline
				1279	void do_user_addr_fault(struct pt_regs *regs,
				1280	unsigned long hw_error_code,
				1281	unsigned long address)
				1282	{
				1283	struct vm_area_struct *vma;
				1284	struct task_struct *tsk;
				1285	struct mm_struct *mm;
				1286	vm_fault_t fault, major = 0;
				1287	unsigned int flags = FAULT_FLAG_ALLOW_RETRY \| FAULT_FLAG_KILLABLE;
				1288
				1289	tsk = current;
				1290	mm = tsk->mm;
				1291
				1292	/* kprobes don't want to hook the spurious faults: */
				1293	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
				1294	return;
				1295
				1296	/*
				1297	* Reserved bits are never expected to be set on
				1298	* entries in the user portion of the page tables.
				1299	*/
				1300	if (unlikely(hw_error_code & X86_PF_RSVD))
				1301	pgtable_bad(regs, hw_error_code, address);
				1302
				1303	/*
				1304	* If SMAP is on, check for invalid kernel (supervisor) access to user
				1305	* pages in the user address space. The odd case here is WRUSS,
				1306	* which, according to the preliminary documentation, does not respect
				1307	* SMAP and will have the USER bit set so, in all cases, SMAP
				1308	* enforcement appears to be consistent with the USER bit.
				1309	*/
				1310	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
				1311	!(hw_error_code & X86_PF_USER) &&
				1312	!(regs->flags & X86_EFLAGS_AC)))
				1313	{
				1314	bad_area_nosemaphore(regs, hw_error_code, address);
				1315	return;
				1316	}
				1317
				1318	/*
				1319	* If we're in an interrupt, have no user context or are running
				1320	* in a region with pagefaults disabled then we must not take the fault
				1321	*/
				1322	if (unlikely(faulthandler_disabled() \|\| !mm)) {
				1323	bad_area_nosemaphore(regs, hw_error_code, address);
				1324	return;
				1325	}
				1326
				1327	/*
				1328	* It's safe to allow irq's after cr2 has been saved and the
				1329	* vmalloc fault has been handled.
				1330	*
				1331	* User-mode registers count as a user access even for any
				1332	* potential system fault or CPU buglet:
				1333	*/
				1334	if (user_mode(regs)) {
				1335	local_irq_enable();
				1336	flags \|= FAULT_FLAG_USER;
				1337	} else {
				1338	if (regs->flags & X86_EFLAGS_IF)
				1339	local_irq_enable();
				1340	}
				1341
				1342	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
				1343
				1344	if (hw_error_code & X86_PF_WRITE)
				1345	flags \|= FAULT_FLAG_WRITE;
				1346	if (hw_error_code & X86_PF_INSTR)
				1347	flags \|= FAULT_FLAG_INSTRUCTION;
				1348
				1349	#ifdef CONFIG_X86_64
				1350	/*
				1351	* Faults in the vsyscall page might need emulation. The
				1352	* vsyscall page is at a high address (>PAGE_OFFSET), but is
				1353	* considered to be part of the user address space.
				1354	*
				1355	* The vsyscall page does not have a "real" VMA, so do this
				1356	* emulation before we go searching for VMAs.
				1357	*
				1358	* PKRU never rejects instruction fetches, so we don't need
				1359	* to consider the PF_PK bit.
				1360	*/
				1361	if (is_vsyscall_vaddr(address)) {
				1362	if (emulate_vsyscall(hw_error_code, regs, address))
				1363	return;
				1364	}
				1365	#endif
				1366
				1367	/*
				1368	* Kernel-mode access to the user address space should only occur
				1369	* on well-defined single instructions listed in the exception
				1370	* tables. But, an erroneous kernel fault occurring outside one of
				1371	* those areas which also holds mmap_sem might deadlock attempting
				1372	* to validate the fault against the address space.
				1373	*
				1374	* Only do the expensive exception table search when we might be at
				1375	* risk of a deadlock. This happens if we
				1376	* 1. Failed to acquire mmap_sem, and
				1377	* 2. The access did not originate in userspace.
				1378	*/
				1379	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
				1380	if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
				1381	/*
				1382	* Fault from code in kernel from
				1383	* which we do not expect faults.
				1384	*/
				1385	bad_area_nosemaphore(regs, hw_error_code, address);
				1386	return;
				1387	}
				1388	retry:
				1389	down_read(&mm->mmap_sem);
				1390	} else {
				1391	/*
				1392	* The above down_read_trylock() might have succeeded in
				1393	* which case we'll have missed the might_sleep() from
				1394	* down_read():
				1395	*/
				1396	might_sleep();
				1397	}
				1398
				1399	vma = find_vma(mm, address);
				1400	if (unlikely(!vma)) {
				1401	bad_area(regs, hw_error_code, address);
				1402	return;
				1403	}
				1404	if (likely(vma->vm_start <= address))
				1405	goto good_area;
				1406	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
				1407	bad_area(regs, hw_error_code, address);
				1408	return;
				1409	}
				1410	if (unlikely(expand_stack(vma, address))) {
				1411	bad_area(regs, hw_error_code, address);
				1412	return;
				1413	}
				1414
				1415	/*
				1416	* Ok, we have a good vm_area for this memory access, so
				1417	* we can handle it..
				1418	*/
				1419	good_area:
				1420	if (unlikely(access_error(hw_error_code, vma))) {
				1421	bad_area_access_error(regs, hw_error_code, address, vma);
				1422	return;
				1423	}
				1424
				1425	/*
				1426	* If for any reason at all we couldn't handle the fault,
				1427	* make sure we exit gracefully rather than endlessly redo
				1428	* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
				1429	* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
				1430	*
				1431	* Note that handle_userfault() may also release and reacquire mmap_sem
				1432	* (and not return with VM_FAULT_RETRY), when returning to userland to
				1433	* repeat the page fault later with a VM_FAULT_NOPAGE retval
				1434	* (potentially after handling any pending signal during the return to
				1435	* userland). The return to userland is identified whenever
				1436	* FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE are both set in flags.
				1437	*/
				1438	fault = handle_mm_fault(vma, address, flags);
				1439	major \|= fault & VM_FAULT_MAJOR;
				1440
				1441	/*
				1442	* If we need to retry the mmap_sem has already been released,
				1443	* and if there is a fatal signal pending there is no guarantee
				1444	* that we made any progress. Handle this case first.
				1445	*/
				1446	if (unlikely(fault & VM_FAULT_RETRY)) {
				1447	/* Retry at most once */
				1448	if (flags & FAULT_FLAG_ALLOW_RETRY) {
				1449	flags &= ~FAULT_FLAG_ALLOW_RETRY;
				1450	flags \|= FAULT_FLAG_TRIED;
				1451	if (!fatal_signal_pending(tsk))
				1452	goto retry;
				1453	}
				1454
				1455	/* User mode? Just return to handle the fatal exception */
				1456	if (flags & FAULT_FLAG_USER)
				1457	return;
				1458
				1459	/* Not returning to user mode? Handle exceptions or die: */
				1460	no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
				1461	return;
				1462	}
				1463
				1464	up_read(&mm->mmap_sem);
				1465	if (unlikely(fault & VM_FAULT_ERROR)) {
				1466	mm_fault_error(regs, hw_error_code, address, fault);
				1467	return;
				1468	}
				1469
				1470	/*
				1471	* Major/minor page fault accounting. If any of the events
				1472	* returned VM_FAULT_MAJOR, we account it as a major fault.
				1473	*/
				1474	if (major) {
				1475	tsk->maj_flt++;
				1476	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
				1477	} else {
				1478	tsk->min_flt++;
				1479	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
				1480	}
				1481
				1482	check_v8086_mode(regs, address, tsk);
				1483	}
				1484	NOKPROBE_SYMBOL(do_user_addr_fault);
				1485
				1486	/*
				1487	* Explicitly marked noinline such that the function tracer sees this as the
				1488	* page_fault entry point.
				1489	*/
				1490	static noinline void
				1491	__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
				1492	unsigned long address)
				1493	{
				1494	prefetchw(&current->mm->mmap_sem);
				1495
				1496	if (unlikely(kmmio_fault(regs, address)))
				1497	return;
				1498
				1499	/* Was the fault on kernel-controlled part of the address space? */
				1500	if (unlikely(fault_in_kernel_space(address)))
				1501	do_kern_addr_fault(regs, hw_error_code, address);
				1502	else
				1503	do_user_addr_fault(regs, hw_error_code, address);
				1504	}
				1505	NOKPROBE_SYMBOL(__do_page_fault);
				1506
				1507	static __always_inline void
				1508	trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
				1509	unsigned long address)
				1510	{
				1511	if (!trace_pagefault_enabled())
				1512	return;
				1513
				1514	if (user_mode(regs))
				1515	trace_page_fault_user(address, regs, error_code);
				1516	else
				1517	trace_page_fault_kernel(address, regs, error_code);
				1518	}
				1519
				1520	dotraplinkage void
				1521	do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
				1522	{
				1523	enum ctx_state prev_state;
				1524
				1525	prev_state = exception_enter();
				1526	trace_page_fault_entries(regs, error_code, address);
				1527	__do_page_fault(regs, error_code, address);
				1528	exception_exit(prev_state);
				1529	}
				1530	NOKPROBE_SYMBOL(do_page_fault);