Blame - marvell/linux/arch/arm64/mm/fault.c - T108

blob: ab5f8a698b716f6e844a8a30d177739d410d2e9b [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Based on arch/arm/mm/fault.c
				4	*
				5	* Copyright (C) 1995 Linus Torvalds
				6	* Copyright (C) 1995-2004 Russell King
				7	* Copyright (C) 2012 ARM Ltd.
				8	*/
				9
				10	#include <linux/acpi.h>
				11	#include <linux/bitfield.h>
				12	#include <linux/extable.h>
				13	#include <linux/kfence.h>
				14	#include <linux/signal.h>
				15	#include <linux/mm.h>
				16	#include <linux/hardirq.h>
				17	#include <linux/init.h>
				18	#include <linux/kprobes.h>
				19	#include <linux/uaccess.h>
				20	#include <linux/page-flags.h>
				21	#include <linux/sched/signal.h>
				22	#include <linux/sched/debug.h>
				23	#include <linux/highmem.h>
				24	#include <linux/perf_event.h>
				25	#include <linux/preempt.h>
				26	#include <linux/hugetlb.h>
				27
				28	#include <asm/acpi.h>
				29	#include <asm/bug.h>
				30	#include <asm/cmpxchg.h>
				31	#include <asm/cpufeature.h>
				32	#include <asm/exception.h>
				33	#include <asm/daifflags.h>
				34	#include <asm/debug-monitors.h>
				35	#include <asm/esr.h>
				36	#include <asm/kasan.h>
				37	#include <asm/sysreg.h>
				38	#include <asm/system_misc.h>
				39	#include <asm/pgtable.h>
				40	#include <asm/tlbflush.h>
				41	#include <asm/traps.h>
				42
				43	struct fault_info {
				44	int (*fn)(unsigned long addr, unsigned int esr,
				45	struct pt_regs *regs);
				46	int sig;
				47	int code;
				48	const char *name;
				49	};
				50
				51	static const struct fault_info fault_info[];
				52	static struct fault_info debug_fault_info[];
				53
				54	static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
				55	{
				56	return fault_info + (esr & ESR_ELx_FSC);
				57	}
				58
				59	static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
				60	{
				61	return debug_fault_info + DBG_ESR_EVT(esr);
				62	}
				63
				64	static void data_abort_decode(unsigned int esr)
				65	{
				66	pr_alert("Data abort info:\n");
				67
				68	if (esr & ESR_ELx_ISV) {
				69	pr_alert(" Access size = %u byte(s)\n",
				70	1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
				71	pr_alert(" SSE = %lu, SRT = %lu\n",
				72	(esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
				73	(esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
				74	pr_alert(" SF = %lu, AR = %lu\n",
				75	(esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
				76	(esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
				77	} else {
				78	pr_alert(" ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
				79	}
				80
				81	pr_alert(" CM = %lu, WnR = %lu\n",
				82	(esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
				83	(esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT);
				84	}
				85
				86	static void mem_abort_decode(unsigned int esr)
				87	{
				88	pr_alert("Mem abort info:\n");
				89
				90	pr_alert(" ESR = 0x%08x\n", esr);
				91	pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n",
				92	ESR_ELx_EC(esr), esr_get_class_string(esr),
				93	(esr & ESR_ELx_IL) ? 32 : 16);
				94	pr_alert(" SET = %lu, FnV = %lu\n",
				95	(esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
				96	(esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
				97	pr_alert(" EA = %lu, S1PTW = %lu\n",
				98	(esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
				99	(esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
				100
				101	if (esr_is_data_abort(esr))
				102	data_abort_decode(esr);
				103	}
				104
				105	static inline bool is_ttbr0_addr(unsigned long addr)
				106	{
				107	/* entry assembly clears tags for TTBR0 addrs */
				108	return addr < TASK_SIZE;
				109	}
				110
				111	static inline bool is_ttbr1_addr(unsigned long addr)
				112	{
				113	/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
				114	return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
				115	}
				116
				117	static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
				118	{
				119	/* Either init_pg_dir or swapper_pg_dir */
				120	if (mm == &init_mm)
				121	return __pa_symbol(mm->pgd);
				122
				123	return (unsigned long)virt_to_phys(mm->pgd);
				124	}
				125
				126	/*
				127	* Dump out the page tables associated with 'addr' in the currently active mm.
				128	*/
				129	static void show_pte(unsigned long addr)
				130	{
				131	struct mm_struct *mm;
				132	pgd_t *pgdp;
				133	pgd_t pgd;
				134
				135	if (is_ttbr0_addr(addr)) {
				136	/* TTBR0 */
				137	mm = current->active_mm;
				138	if (mm == &init_mm) {
				139	pr_alert("[%016lx] user address but active_mm is swapper\n",
				140	addr);
				141	return;
				142	}
				143	} else if (is_ttbr1_addr(addr)) {
				144	/* TTBR1 */
				145	mm = &init_mm;
				146	} else {
				147	pr_alert("[%016lx] address between user and kernel address ranges\n",
				148	addr);
				149	return;
				150	}
				151
				152	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
				153	mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
				154	vabits_actual, mm_to_pgd_phys(mm));
				155	pgdp = pgd_offset(mm, addr);
				156	pgd = READ_ONCE(*pgdp);
				157	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
				158
				159	do {
				160	pud_t *pudp, pud;
				161	pmd_t *pmdp, pmd;
				162	pte_t *ptep, pte;
				163
				164	if (pgd_none(pgd) \|\| pgd_bad(pgd))
				165	break;
				166
				167	pudp = pud_offset(pgdp, addr);
				168	pud = READ_ONCE(*pudp);
				169	pr_cont(", pud=%016llx", pud_val(pud));
				170	if (pud_none(pud) \|\| pud_bad(pud))
				171	break;
				172
				173	pmdp = pmd_offset(pudp, addr);
				174	pmd = READ_ONCE(*pmdp);
				175	pr_cont(", pmd=%016llx", pmd_val(pmd));
				176	if (pmd_none(pmd) \|\| pmd_bad(pmd))
				177	break;
				178
				179	ptep = pte_offset_map(pmdp, addr);
				180	pte = READ_ONCE(*ptep);
				181	pr_cont(", pte=%016llx", pte_val(pte));
				182	pte_unmap(ptep);
				183	} while(0);
				184
				185	pr_cont("\n");
				186	}
				187
				188	/*
				189	* This function sets the access flags (dirty, accessed), as well as write
				190	* permission, and only to a more permissive setting.
				191	*
				192	* It needs to cope with hardware update of the accessed/dirty state by other
				193	* agents in the system and can safely skip the __sync_icache_dcache() call as,
				194	* like set_pte_at(), the PTE is never changed from no-exec to exec here.
				195	*
				196	* Returns whether or not the PTE actually changed.
				197	*/
				198	int ptep_set_access_flags(struct vm_area_struct *vma,
				199	unsigned long address, pte_t *ptep,
				200	pte_t entry, int dirty)
				201	{
				202	pteval_t old_pteval, pteval;
				203	pte_t pte = READ_ONCE(*ptep);
				204
				205	if (pte_same(pte, entry))
				206	return 0;
				207
				208	/* only preserve the access flags and write permission */
				209	pte_val(entry) &= PTE_RDONLY \| PTE_AF \| PTE_WRITE \| PTE_DIRTY;
				210
				211	/*
				212	* Setting the flags must be done atomically to avoid racing with the
				213	* hardware update of the access/dirty state. The PTE_RDONLY bit must
				214	* be set to the most permissive (lowest value) of *ptep and entry
				215	* (calculated as: a & b == ~(~a \| ~b)).
				216	*/
				217	pte_val(entry) ^= PTE_RDONLY;
				218	pteval = pte_val(pte);
				219	do {
				220	old_pteval = pteval;
				221	pteval ^= PTE_RDONLY;
				222	pteval \|= pte_val(entry);
				223	pteval ^= PTE_RDONLY;
				224	pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
				225	} while (pteval != old_pteval);
				226
				227	flush_tlb_fix_spurious_fault(vma, address);
				228	return 1;
				229	}
				230
				231	static bool is_el1_instruction_abort(unsigned int esr)
				232	{
				233	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
				234	}
				235
				236	static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
				237	struct pt_regs *regs)
				238	{
				239	unsigned int ec = ESR_ELx_EC(esr);
				240	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
				241
				242	if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
				243	return false;
				244
				245	if (fsc_type == ESR_ELx_FSC_PERM)
				246	return true;
				247
				248	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
				249	return fsc_type == ESR_ELx_FSC_FAULT &&
				250	(regs->pstate & PSR_PAN_BIT);
				251
				252	return false;
				253	}
				254
				255	static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
				256	unsigned int esr,
				257	struct pt_regs *regs)
				258	{
				259	unsigned long flags;
				260	u64 par, dfsc;
				261
				262	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR \|\|
				263	(esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
				264	return false;
				265
				266	local_irq_save(flags);
				267	asm volatile("at s1e1r, %0" :: "r" (addr));
				268	isb();
				269	par = read_sysreg(par_el1);
				270	local_irq_restore(flags);
				271
				272	/*
				273	* If we now have a valid translation, treat the translation fault as
				274	* spurious.
				275	*/
				276	if (!(par & SYS_PAR_EL1_F))
				277	return true;
				278
				279	/*
				280	* If we got a different type of fault from the AT instruction,
				281	* treat the translation fault as spurious.
				282	*/
				283	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
				284	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
				285	}
				286
				287	static void die_kernel_fault(const char *msg, unsigned long addr,
				288	unsigned int esr, struct pt_regs *regs)
				289	{
				290	bust_spinlocks(1);
				291
				292	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
				293	addr);
				294
				295	mem_abort_decode(esr);
				296
				297	show_pte(addr);
				298	die("Oops", regs, esr);
				299	bust_spinlocks(0);
				300	make_task_dead(SIGKILL);
				301	}
				302
				303	static void __do_kernel_fault(unsigned long addr, unsigned int esr,
				304	struct pt_regs *regs)
				305	{
				306	const char *msg;
				307
				308	/*
				309	* Are we prepared to handle this kernel fault?
				310	* We are almost certainly not prepared to handle instruction faults.
				311	*/
				312	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
				313	return;
				314
				315	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
				316	"Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
				317	return;
				318
				319	if (is_el1_permission_fault(addr, esr, regs)) {
				320	if (esr & ESR_ELx_WNR)
				321	msg = "write to read-only memory";
				322	else
				323	msg = "read from unreadable memory";
				324	} else if (addr < PAGE_SIZE) {
				325	msg = "NULL pointer dereference";
				326	} else {
				327	if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
				328	return;
				329
				330	msg = "paging request";
				331	}
				332
				333	die_kernel_fault(msg, addr, esr, regs);
				334	}
				335
				336	static void set_thread_esr(unsigned long address, unsigned int esr)
				337	{
				338	current->thread.fault_address = address;
				339
				340	/*
				341	* If the faulting address is in the kernel, we must sanitize the ESR.
				342	* From userspace's point of view, kernel-only mappings don't exist
				343	* at all, so we report them as level 0 translation faults.
				344	* (This is not quite the way that "no mapping there at all" behaves:
				345	* an alignment fault not caused by the memory type would take
				346	* precedence over translation fault for a real access to empty
				347	* space. Unfortunately we can't easily distinguish "alignment fault
				348	* not caused by memory type" from "alignment fault caused by memory
				349	* type", so we ignore this wrinkle and just return the translation
				350	* fault.)
				351	*/
				352	if (!is_ttbr0_addr(current->thread.fault_address)) {
				353	switch (ESR_ELx_EC(esr)) {
				354	case ESR_ELx_EC_DABT_LOW:
				355	/*
				356	* These bits provide only information about the
				357	* faulting instruction, which userspace knows already.
				358	* We explicitly clear bits which are architecturally
				359	* RES0 in case they are given meanings in future.
				360	* We always report the ESR as if the fault was taken
				361	* to EL1 and so ISV and the bits in ISS[23:14] are
				362	* clear. (In fact it always will be a fault to EL1.)
				363	*/
				364	esr &= ESR_ELx_EC_MASK \| ESR_ELx_IL \|
				365	ESR_ELx_CM \| ESR_ELx_WNR;
				366	esr \|= ESR_ELx_FSC_FAULT;
				367	break;
				368	case ESR_ELx_EC_IABT_LOW:
				369	/*
				370	* Claim a level 0 translation fault.
				371	* All other bits are architecturally RES0 for faults
				372	* reported with that DFSC value, so we clear them.
				373	*/
				374	esr &= ESR_ELx_EC_MASK \| ESR_ELx_IL;
				375	esr \|= ESR_ELx_FSC_FAULT;
				376	break;
				377	default:
				378	/*
				379	* This should never happen (entry.S only brings us
				380	* into this code for insn and data aborts from a lower
				381	* exception level). Fail safe by not providing an ESR
				382	* context record at all.
				383	*/
				384	WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
				385	esr = 0;
				386	break;
				387	}
				388	}
				389
				390	current->thread.fault_code = esr;
				391	}
				392
				393	static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
				394	{
				395	/*
				396	* If we are in kernel mode at this point, we have no context to
				397	* handle this fault with.
				398	*/
				399	if (user_mode(regs)) {
				400	const struct fault_info *inf = esr_to_fault_info(esr);
				401
				402	set_thread_esr(addr, esr);
				403	arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
				404	inf->name);
				405	} else {
				406	__do_kernel_fault(addr, esr, regs);
				407	}
				408	}
				409
				410	#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
				411	#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
				412
				413	static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
				414	unsigned int mm_flags, unsigned long vm_flags)
				415	{
				416	struct vm_area_struct *vma = find_vma(mm, addr);
				417
				418	if (unlikely(!vma))
				419	return VM_FAULT_BADMAP;
				420
				421	/*
				422	* Ok, we have a good vm_area for this memory access, so we can handle
				423	* it.
				424	*/
				425	if (unlikely(vma->vm_start > addr)) {
				426	if (!(vma->vm_flags & VM_GROWSDOWN))
				427	return VM_FAULT_BADMAP;
				428	if (expand_stack(vma, addr))
				429	return VM_FAULT_BADMAP;
				430	}
				431
				432	/*
				433	* Check that the permissions on the VMA allow for the fault which
				434	* occurred.
				435	*/
				436	if (!(vma->vm_flags & vm_flags))
				437	return VM_FAULT_BADACCESS;
				438	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
				439	}
				440
				441	static bool is_el0_instruction_abort(unsigned int esr)
				442	{
				443	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
				444	}
				445
				446	/*
				447	* Note: not valid for EL1 DC IVAC, but we never use that such that it
				448	* should fault. EL0 cannot issue DC IVAC (undef).
				449	*/
				450	static bool is_write_abort(unsigned int esr)
				451	{
				452	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
				453	}
				454
				455	static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
				456	struct pt_regs *regs)
				457	{
				458	const struct fault_info *inf;
				459	struct mm_struct *mm = current->mm;
				460	vm_fault_t fault, major = 0;
				461	unsigned long vm_flags = VM_READ \| VM_WRITE \| VM_EXEC;
				462	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
				463
				464	if (kprobe_page_fault(regs, esr))
				465	return 0;
				466
				467	/*
				468	* If we're in an interrupt or have no user context, we must not take
				469	* the fault.
				470	*/
				471	if (faulthandler_disabled() \|\| !mm)
				472	goto no_context;
				473
				474	if (user_mode(regs))
				475	mm_flags \|= FAULT_FLAG_USER;
				476
				477	if (is_el0_instruction_abort(esr)) {
				478	vm_flags = VM_EXEC;
				479	mm_flags \|= FAULT_FLAG_INSTRUCTION;
				480	} else if (is_write_abort(esr)) {
				481	vm_flags = VM_WRITE;
				482	mm_flags \|= FAULT_FLAG_WRITE;
				483	}
				484
				485	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
				486	/* regs->orig_addr_limit may be 0 if we entered from EL0 */
				487	if (regs->orig_addr_limit == KERNEL_DS)
				488	die_kernel_fault("access to user memory with fs=KERNEL_DS",
				489	addr, esr, regs);
				490
				491	if (is_el1_instruction_abort(esr))
				492	die_kernel_fault("execution of user memory",
				493	addr, esr, regs);
				494
				495	if (!search_exception_tables(regs->pc))
				496	die_kernel_fault("access to user memory outside uaccess routines",
				497	addr, esr, regs);
				498	}
				499
				500	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
				501
				502	/*
				503	* As per x86, we may deadlock here. However, since the kernel only
				504	* validly references user space from well defined areas of the code,
				505	* we can bug out early if this is from code which shouldn't.
				506	*/
				507	if (!down_read_trylock(&mm->mmap_sem)) {
				508	if (!user_mode(regs) && !search_exception_tables(regs->pc))
				509	goto no_context;
				510	retry:
				511	down_read(&mm->mmap_sem);
				512	} else {
				513	/*
				514	* The above down_read_trylock() might have succeeded in which
				515	* case, we'll have missed the might_sleep() from down_read().
				516	*/
				517	might_sleep();
				518	#ifdef CONFIG_DEBUG_VM
				519	if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
				520	up_read(&mm->mmap_sem);
				521	goto no_context;
				522	}
				523	#endif
				524	}
				525
				526	fault = __do_page_fault(mm, addr, mm_flags, vm_flags);
				527	major \|= fault & VM_FAULT_MAJOR;
				528
				529	/* Quick path to respond to signals */
				530	if (fault_signal_pending(fault, regs)) {
				531	if (!user_mode(regs))
				532	goto no_context;
				533	return 0;
				534	}
				535
				536	if (fault & VM_FAULT_RETRY) {
				537	if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
				538	mm_flags \|= FAULT_FLAG_TRIED;
				539	goto retry;
				540	}
				541	}
				542	up_read(&mm->mmap_sem);
				543
				544	/*
				545	* Handle the "normal" (no error) case first.
				546	*/
				547	if (likely(!(fault & (VM_FAULT_ERROR \| VM_FAULT_BADMAP \|
				548	VM_FAULT_BADACCESS)))) {
				549	/*
				550	* Major/minor page fault accounting is only done
				551	* once. If we go through a retry, it is extremely
				552	* likely that the page will be found in page cache at
				553	* that point.
				554	*/
				555	if (major) {
				556	current->maj_flt++;
				557	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
				558	addr);
				559	} else {
				560	current->min_flt++;
				561	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
				562	addr);
				563	}
				564
				565	return 0;
				566	}
				567
				568	/*
				569	* If we are in kernel mode at this point, we have no context to
				570	* handle this fault with.
				571	*/
				572	if (!user_mode(regs))
				573	goto no_context;
				574
				575	if (fault & VM_FAULT_OOM) {
				576	/*
				577	* We ran out of memory, call the OOM killer, and return to
				578	* userspace (which will retry the fault, or kill us if we got
				579	* oom-killed).
				580	*/
				581	pagefault_out_of_memory();
				582	return 0;
				583	}
				584
				585	inf = esr_to_fault_info(esr);
				586	set_thread_esr(addr, esr);
				587	if (fault & VM_FAULT_SIGBUS) {
				588	/*
				589	* We had some memory, but were unable to successfully fix up
				590	* this page fault.
				591	*/
				592	arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
				593	inf->name);
				594	} else if (fault & (VM_FAULT_HWPOISON_LARGE \| VM_FAULT_HWPOISON)) {
				595	unsigned int lsb;
				596
				597	lsb = PAGE_SHIFT;
				598	if (fault & VM_FAULT_HWPOISON_LARGE)
				599	lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
				600
				601	arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
				602	inf->name);
				603	} else {
				604	/*
				605	* Something tried to access memory that isn't in our memory
				606	* map.
				607	*/
				608	arm64_force_sig_fault(SIGSEGV,
				609	fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
				610	(void __user *)addr,
				611	inf->name);
				612	}
				613
				614	return 0;
				615
				616	no_context:
				617	__do_kernel_fault(addr, esr, regs);
				618	return 0;
				619	}
				620
				621	static int __kprobes do_translation_fault(unsigned long addr,
				622	unsigned int esr,
				623	struct pt_regs *regs)
				624	{
				625	if (is_ttbr0_addr(addr))
				626	return do_page_fault(addr, esr, regs);
				627
				628	do_bad_area(addr, esr, regs);
				629	return 0;
				630	}
				631
				632	static int do_alignment_fault(unsigned long addr, unsigned int esr,
				633	struct pt_regs *regs)
				634	{
				635	do_bad_area(addr, esr, regs);
				636	return 0;
				637	}
				638
				639	static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
				640	{
				641	return 1; /* "fault" */
				642	}
				643
				644	static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
				645	{
				646	const struct fault_info *inf;
				647	void __user *siaddr;
				648
				649	inf = esr_to_fault_info(esr);
				650
				651	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
				652	/*
				653	* APEI claimed this as a firmware-first notification.
				654	* Some processing deferred to task_work before ret_to_user().
				655	*/
				656	return 0;
				657	}
				658
				659	if (esr & ESR_ELx_FnV)
				660	siaddr = NULL;
				661	else
				662	siaddr = (void __user *)addr;
				663	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
				664
				665	return 0;
				666	}
				667
				668	static const struct fault_info fault_info[] = {
				669	{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
				670	{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
				671	{ do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" },
				672	{ do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" },
				673	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
				674	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
				675	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
				676	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
				677	{ do_bad, SIGKILL, SI_KERNEL, "unknown 8" },
				678	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
				679	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
				680	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
				681	{ do_bad, SIGKILL, SI_KERNEL, "unknown 12" },
				682	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
				683	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
				684	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
				685	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
				686	{ do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
				687	{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
				688	{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
				689	{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
				690	{ do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" },
				691	{ do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" },
				692	{ do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" },
				693	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented
				694	{ do_bad, SIGKILL, SI_KERNEL, "unknown 25" },
				695	{ do_bad, SIGKILL, SI_KERNEL, "unknown 26" },
				696	{ do_bad, SIGKILL, SI_KERNEL, "unknown 27" },
				697	{ do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
				698	{ do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
				699	{ do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
				700	{ do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
				701	{ do_bad, SIGKILL, SI_KERNEL, "unknown 32" },
				702	{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
				703	{ do_bad, SIGKILL, SI_KERNEL, "unknown 34" },
				704	{ do_bad, SIGKILL, SI_KERNEL, "unknown 35" },
				705	{ do_bad, SIGKILL, SI_KERNEL, "unknown 36" },
				706	{ do_bad, SIGKILL, SI_KERNEL, "unknown 37" },
				707	{ do_bad, SIGKILL, SI_KERNEL, "unknown 38" },
				708	{ do_bad, SIGKILL, SI_KERNEL, "unknown 39" },
				709	{ do_bad, SIGKILL, SI_KERNEL, "unknown 40" },
				710	{ do_bad, SIGKILL, SI_KERNEL, "unknown 41" },
				711	{ do_bad, SIGKILL, SI_KERNEL, "unknown 42" },
				712	{ do_bad, SIGKILL, SI_KERNEL, "unknown 43" },
				713	{ do_bad, SIGKILL, SI_KERNEL, "unknown 44" },
				714	{ do_bad, SIGKILL, SI_KERNEL, "unknown 45" },
				715	{ do_bad, SIGKILL, SI_KERNEL, "unknown 46" },
				716	{ do_bad, SIGKILL, SI_KERNEL, "unknown 47" },
				717	{ do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" },
				718	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
				719	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
				720	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
				721	{ do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" },
				722	{ do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" },
				723	{ do_bad, SIGKILL, SI_KERNEL, "unknown 54" },
				724	{ do_bad, SIGKILL, SI_KERNEL, "unknown 55" },
				725	{ do_bad, SIGKILL, SI_KERNEL, "unknown 56" },
				726	{ do_bad, SIGKILL, SI_KERNEL, "unknown 57" },
				727	{ do_bad, SIGKILL, SI_KERNEL, "unknown 58" },
				728	{ do_bad, SIGKILL, SI_KERNEL, "unknown 59" },
				729	{ do_bad, SIGKILL, SI_KERNEL, "unknown 60" },
				730	{ do_bad, SIGKILL, SI_KERNEL, "section domain fault" },
				731	{ do_bad, SIGKILL, SI_KERNEL, "page domain fault" },
				732	{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
				733	};
				734
				735	asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
				736	struct pt_regs *regs)
				737	{
				738	const struct fault_info *inf = esr_to_fault_info(esr);
				739
				740	if (!inf->fn(addr, esr, regs))
				741	return;
				742
				743	if (!user_mode(regs)) {
				744	pr_alert("Unhandled fault at 0x%016lx\n", addr);
				745	mem_abort_decode(esr);
				746	show_pte(addr);
				747	}
				748
				749	arm64_notify_die(inf->name, regs,
				750	inf->sig, inf->code, (void __user *)addr, esr);
				751	}
				752
				753	asmlinkage void __exception do_el0_irq_bp_hardening(void)
				754	{
				755	/* PC has already been checked in entry.S */
				756	arm64_apply_bp_hardening();
				757	}
				758
				759	asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
				760	unsigned int esr,
				761	struct pt_regs *regs)
				762	{
				763	/*
				764	* We've taken an instruction abort from userspace and not yet
				765	* re-enabled IRQs. If the address is a kernel address, apply
				766	* BP hardening prior to enabling IRQs and pre-emption.
				767	*/
				768	if (!is_ttbr0_addr(addr))
				769	arm64_apply_bp_hardening();
				770
				771	local_daif_restore(DAIF_PROCCTX);
				772	do_mem_abort(addr, esr, regs);
				773	}
				774
				775
				776	asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
				777	unsigned int esr,
				778	struct pt_regs *regs)
				779	{
				780	if (user_mode(regs)) {
				781	if (!is_ttbr0_addr(instruction_pointer(regs)))
				782	arm64_apply_bp_hardening();
				783	local_daif_restore(DAIF_PROCCTX);
				784	}
				785
				786	arm64_notify_die("SP/PC alignment exception", regs,
				787	SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
				788	}
				789
				790	int __init early_brk64(unsigned long addr, unsigned int esr,
				791	struct pt_regs *regs);
				792
				793	/*
				794	* __refdata because early_brk64 is __init, but the reference to it is
				795	* clobbered at arch_initcall time.
				796	* See traps.c and debug-monitors.c:debug_traps_init().
				797	*/
				798	static struct fault_info __refdata debug_fault_info[] = {
				799	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
				800	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
				801	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
				802	{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
				803	{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
				804	{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
				805	{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
				806	{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
				807	};
				808
				809	void __init hook_debug_fault_code(int nr,
				810	int (fn)(unsigned long, unsigned int, struct pt_regs ),
				811	int sig, int code, const char *name)
				812	{
				813	BUG_ON(nr < 0 \|\| nr >= ARRAY_SIZE(debug_fault_info));
				814
				815	debug_fault_info[nr].fn = fn;
				816	debug_fault_info[nr].sig = sig;
				817	debug_fault_info[nr].code = code;
				818	debug_fault_info[nr].name = name;
				819	}
				820
				821	/*
				822	* In debug exception context, we explicitly disable preemption despite
				823	* having interrupts disabled.
				824	* This serves two purposes: it makes it much less likely that we would
				825	* accidentally schedule in exception context and it will force a warning
				826	* if we somehow manage to schedule by accident.
				827	*/
				828	static void debug_exception_enter(struct pt_regs *regs)
				829	{
				830	/*
				831	* Tell lockdep we disabled irqs in entry.S. Do nothing if they were
				832	* already disabled to preserve the last enabled/disabled addresses.
				833	*/
				834	if (interrupts_enabled(regs))
				835	trace_hardirqs_off();
				836
				837	if (user_mode(regs)) {
				838	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
				839	} else {
				840	/*
				841	* We might have interrupted pretty much anything. In
				842	* fact, if we're a debug exception, we can even interrupt
				843	* NMI processing. We don't want this code makes in_nmi()
				844	* to return true, but we need to notify RCU.
				845	*/
				846	rcu_nmi_enter();
				847	}
				848
				849	preempt_disable();
				850
				851	/* This code is a bit fragile. Test it. */
				852	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
				853	}
				854	NOKPROBE_SYMBOL(debug_exception_enter);
				855
				856	static void debug_exception_exit(struct pt_regs *regs)
				857	{
				858	preempt_enable_no_resched();
				859
				860	if (!user_mode(regs))
				861	rcu_nmi_exit();
				862
				863	if (interrupts_enabled(regs))
				864	trace_hardirqs_on();
				865	}
				866	NOKPROBE_SYMBOL(debug_exception_exit);
				867
				868	#ifdef CONFIG_ARM64_ERRATUM_1463225
				869	DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
				870
				871	static int __exception
				872	cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
				873	{
				874	if (user_mode(regs))
				875	return 0;
				876
				877	if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa))
				878	return 0;
				879
				880	/*
				881	* We've taken a dummy step exception from the kernel to ensure
				882	* that interrupts are re-enabled on the syscall path. Return back
				883	* to cortex_a76_erratum_1463225_svc_handler() with debug exceptions
				884	* masked so that we can safely restore the mdscr and get on with
				885	* handling the syscall.
				886	*/
				887	regs->pstate \|= PSR_D_BIT;
				888	return 1;
				889	}
				890	#else
				891	static int __exception
				892	cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
				893	{
				894	return 0;
				895	}
				896	#endif /* CONFIG_ARM64_ERRATUM_1463225 */
				897
				898	asmlinkage void __exception do_debug_exception(unsigned long addr_if_watchpoint,
				899	unsigned int esr,
				900	struct pt_regs *regs)
				901	{
				902	const struct fault_info *inf = esr_to_debug_fault_info(esr);
				903	unsigned long pc = instruction_pointer(regs);
				904
				905	if (cortex_a76_erratum_1463225_debug_handler(regs))
				906	return;
				907
				908	debug_exception_enter(regs);
				909
				910	if (user_mode(regs) && !is_ttbr0_addr(pc))
				911	arm64_apply_bp_hardening();
				912
				913	if (inf->fn(addr_if_watchpoint, esr, regs)) {
				914	arm64_notify_die(inf->name, regs,
				915	inf->sig, inf->code, (void __user *)pc, esr);
				916	}
				917
				918	debug_exception_exit(regs);
				919	}
				920	NOKPROBE_SYMBOL(do_debug_exception);