Blame - marvell/linux/arch/arm/net/bpf_jit_32.c - T108

blob: 1c6e57f1dbc488a44c2d993bd32c845dce4816d7 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Just-In-Time compiler for eBPF filters on 32bit ARM
				4	*
				5	* Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
				6	* Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
				7	*/
				8
				9	#include <linux/bpf.h>
				10	#include <linux/bitops.h>
				11	#include <linux/compiler.h>
				12	#include <linux/errno.h>
				13	#include <linux/filter.h>
				14	#include <linux/netdevice.h>
				15	#include <linux/string.h>
				16	#include <linux/slab.h>
				17	#include <linux/if_vlan.h>
				18
				19	#include <asm/cacheflush.h>
				20	#include <asm/hwcap.h>
				21	#include <asm/opcodes.h>
				22	#include <asm/system_info.h>
				23
				24	#include "bpf_jit_32.h"
				25
				/*
				 * eBPF prog stack layout:
				 *
				 *                         high
				 * original ARM_SP =>     +-----+
				 *                        |     | callee saved registers
				 *                        +-----+ <= (BPF_FP + SCRATCH_SIZE)
				 *                        | ... | eBPF JIT scratch space
				 * eBPF fp register =>    +-----+
				 *   (BPF_FP)             | ... | eBPF prog stack
				 *                        +-----+
				 *                        |RSVD | JIT scratchpad
				 * current ARM_SP =>      +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE)
				 *                        | ... | caller-saved registers
				 *                        +-----+
				 *                        | ... | arguments passed on stack
				 * ARM_SP during call =>  +-----|
				 *                        |     |
				 *                        | ... | Function call stack
				 *                        |     |
				 *                        +-----+
				 *                          low
				 *
				 * The callee saved registers depend on whether frame pointers are enabled.
				 * With frame pointers (to be compliant with the ABI):
				 *
				 *                              high
				 * original ARM_SP =>     +--------------+ \
				 *                        |      pc      | |
				 * current ARM_FP =>      +--------------+ } callee saved registers
				 *                        |r4-r9,fp,ip,lr| |
				 *                        +--------------+ /
				 *                              low
				 *
				 * Without frame pointers:
				 *
				 *                              high
				 * original ARM_SP =>     +--------------+
				 *                        |  r4-r9,fp,lr | callee saved registers
				 * current ARM_FP =>      +--------------+
				 *                              low
				 *
				 * When popping registers off the stack at the end of a BPF function, we
				 * reference them via the current ARM_FP register.
				 *
				 * Some eBPF operations are implemented via a call to a helper function.
				 * Such calls are "invisible" in the eBPF code, so it is up to the calling
				 * program to preserve any caller-saved ARM registers during the call. The
				 * JIT emits code to push and pop those registers onto the stack, immediately
				 * above the callee stack frame.
				 */
				/*
				 * Register bitmasks, one bit per ARM core register number.
				 *
				 * CALLEE_MASK      - r4-r9 and fp.
				 * CALLEE_PUSH_MASK - CALLEE_MASK plus lr.
				 * CALLEE_POP_MASK  - CALLEE_MASK plus pc.
				 * CALLER_MASK      - r0-r3.
				 */
				#define CALLEE_MASK	(1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \
							 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \
							 1 << ARM_FP)
				#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR)
				#define CALLEE_POP_MASK  (CALLEE_MASK | 1 << ARM_PC)

				#define CALLER_MASK	(1 << ARM_R0 | 1 << ARM_R1 | 1 << ARM_R2 | 1 << ARM_R3)
				84
				enum {
					/*
					 * Stack layout - these are offsets from (top of stack - 4).
					 * Each stacked 64-bit value owns two consecutive 32-bit slots,
					 * high word first.
					 */
					BPF_R2_HI, BPF_R2_LO,
					BPF_R3_HI, BPF_R3_LO,
					BPF_R4_HI, BPF_R4_LO,
					BPF_R5_HI, BPF_R5_LO,
					BPF_R7_HI, BPF_R7_LO,
					BPF_R8_HI, BPF_R8_LO,
					BPF_R9_HI, BPF_R9_LO,
					BPF_FP_HI, BPF_FP_LO,
					BPF_TC_HI, BPF_TC_LO,
					BPF_AX_HI, BPF_AX_LO,
					/*
					 * Total number of 32-bit slots: stack space for BPF_REG_2,
					 * BPF_REG_3, BPF_REG_4, BPF_REG_5, BPF_REG_7, BPF_REG_8,
					 * BPF_REG_9, BPF_REG_FP, the tail call count and BPF_REG_AX.
					 */
					BPF_JIT_SCRATCH_REGS,
				};
				113
				/*
				 * A negative "register" number means the value lives on the stack; the
				 * number itself is the byte offset from the top of the eBPF JIT scratch
				 * space. Slot k therefore sits at -4 - 4 * k.
				 */
				#define STACK_OFFSET(k)	(-4 - (k) * 4)
				/* Size in bytes of the whole scratch area (one 32-bit word per slot). */
				#define SCRATCH_SIZE	(BPF_JIT_SCRATCH_REGS * 4)

				/*
				 * Convert a scratch-space offset into an offset usable with ARM_FP.
				 * With frame pointers the registers of CALLEE_PUSH_MASK plus one extra
				 * word are skipped; without them the offset is used unchanged.
				 */
				#ifdef CONFIG_FRAME_POINTER
				#define EBPF_SCRATCH_TO_ARM_FP(x) ((x) - 4 * hweight16(CALLEE_PUSH_MASK) - 4)
				#else
				#define EBPF_SCRATCH_TO_ARM_FP(x) (x)
				#endif

				/* JIT-internal pseudo registers, numbered after the last eBPF register. */
				#define TMP_REG_1	(MAX_BPF_JIT_REG + 0)	/* TEMP Register 1 */
				#define TMP_REG_2	(MAX_BPF_JIT_REG + 1)	/* TEMP Register 2 */
				#define TCALL_CNT	(MAX_BPF_JIT_REG + 2)	/* Tail Call Count */

				/* Bit in jit_ctx.flags. */
				#define FLAG_IMM_OVERFLOW	(1 << 0)
				132
				133	/*
				134	* Map eBPF registers to ARM 32bit registers or stack scratch space.
				135	*
				136	* 1. First argument is passed using the arm 32bit registers and rest of the
				137	* arguments are passed on stack scratch space.
				138	* 2. First callee-saved argument is mapped to arm 32 bit registers and rest
				139	* arguments are mapped to scratch space on stack.
				140	* 3. We need two 64 bit temp registers to do complex operations on eBPF
				141	* registers.
				142	*
				143	* As the eBPF registers are all 64 bit registers and arm has only 32 bit
				144	* registers, we have to map each eBPF registers with two arm 32 bit regs or
				145	* scratch memory space and we have to build eBPF 64 bit register from those.
				146	*
				147	*/
				148	static const s8 bpf2a32[][2] = {
				149	/* return value from in-kernel function, and exit value from eBPF */
				150	[BPF_REG_0] = {ARM_R1, ARM_R0},
				151	/* arguments from eBPF program to in-kernel function */
				152	[BPF_REG_1] = {ARM_R3, ARM_R2},
				153	/* Stored on stack scratch space */
				154	[BPF_REG_2] = {STACK_OFFSET(BPF_R2_HI), STACK_OFFSET(BPF_R2_LO)},
				155	[BPF_REG_3] = {STACK_OFFSET(BPF_R3_HI), STACK_OFFSET(BPF_R3_LO)},
				156	[BPF_REG_4] = {STACK_OFFSET(BPF_R4_HI), STACK_OFFSET(BPF_R4_LO)},
				157	[BPF_REG_5] = {STACK_OFFSET(BPF_R5_HI), STACK_OFFSET(BPF_R5_LO)},
				158	/* callee saved registers that in-kernel function will preserve */
				159	[BPF_REG_6] = {ARM_R5, ARM_R4},
				160	/* Stored on stack scratch space */
				161	[BPF_REG_7] = {STACK_OFFSET(BPF_R7_HI), STACK_OFFSET(BPF_R7_LO)},
				162	[BPF_REG_8] = {STACK_OFFSET(BPF_R8_HI), STACK_OFFSET(BPF_R8_LO)},
				163	[BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)},
				164	/* Read only Frame Pointer to access Stack */
				165	[BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)},
				166	/* Temporary Register for internal BPF JIT, can be used
				167	* for constant blindings and others.
				168	*/
				169	[TMP_REG_1] = {ARM_R7, ARM_R6},
				170	[TMP_REG_2] = {ARM_R9, ARM_R8},
				171	/* Tail call count. Stored on stack scratch space. */
				172	[TCALL_CNT] = {STACK_OFFSET(BPF_TC_HI), STACK_OFFSET(BPF_TC_LO)},
				173	/* temporary register for blinding constants.
				174	* Stored on stack scratch space.
				175	*/
				176	[BPF_REG_AX] = {STACK_OFFSET(BPF_AX_HI), STACK_OFFSET(BPF_AX_LO)},
				177	};
				178
				/*
				 * Shorthand for the halves of a {hi, lo} register pair. These expand
				 * against variables literally named "dst" and "src" at the point of use.
				 */
				#define dst_hi	dst[0]
				#define dst_lo	dst[1]
				#define src_hi	src[0]
				#define src_lo	src[1]
				183
				184	/*
				185	* JIT Context:
				186	*
				187	* prog : bpf_prog
				188	* idx : index of current last JITed instruction.
				189	* prologue_bytes : bytes used in prologue.
				190	* epilogue_offset : offset of epilogue starting.
				191	* offsets : array of eBPF instruction offsets in
				192	* JITed code.
				193	* target : final JITed code.
				194	* epilogue_bytes : no of bytes used in epilogue.
				195	* imm_count : no of immediate counts used for global
				196	* variables.
				197	* imms : array of global variable addresses.
				198	*/
				199
				200	struct jit_ctx {
				201	const struct bpf_prog *prog;
				202	unsigned int idx;
				203	unsigned int prologue_bytes;
				204	unsigned int epilogue_offset;
				205	unsigned int cpu_architecture;
				206	u32 flags;
				207	u32 *offsets;
				208	u32 *target;
				209	u32 stack_size;
				210	#if __LINUX_ARM_ARCH__ < 7
				211	u16 epilogue_bytes;
				212	u16 imm_count;
				213	u32 *imms;
				214	#endif
				215	};
				216
				217	/*
				218	* Wrappers which handle both OABI and EABI and assures Thumb2 interworking
				219	* (where the assembly routines like __aeabi_uidiv could cause problems).
				220	*/
				221	static u32 jit_udiv32(u32 dividend, u32 divisor)
				222	{
				223	return dividend / divisor;
				224	}
				225
				226	static u32 jit_mod32(u32 dividend, u32 divisor)
				227	{
				228	return dividend % divisor;
				229	}
				230
				231	static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
				232	{
				233	inst \|= (cond << 28);
				234	inst = __opcode_to_mem_arm(inst);
				235
				236	if (ctx->target != NULL)
				237	ctx->target[ctx->idx] = inst;
				238
				239	ctx->idx++;
				240	}
				241
				242	/*
				243	* Emit an instruction that will be executed unconditionally.
				244	*/
				245	static inline void emit(u32 inst, struct jit_ctx *ctx)
				246	{
				247	_emit(ARM_COND_AL, inst, ctx);
				248	}
				249
				/*
				 * This is rather horrid, but necessary to convert an integer constant
				 * to an immediate operand for the opcodes, and be able to detect at
				 * build time whether the constant can't be converted (iow, usable in
				 * BUILD_BUG_ON()).
				 *
				 * imm12val(v, s) rotates v left by s bits and ORs in s << 7.
				 * const_imm8m(x) tries every even rotation from 0 to 30 in order and
				 * yields the first encoding whose mask covers x, or -1 if none does.
				 */
				#define imm12val(v, s) (rol32(v, (s)) | (s) << 7)
				#define const_imm8m(x)					\
					({ int r;					\
					   u32 v = (x);					\
					   if (!(v & ~0x000000ff))			\
						r = imm12val(v, 0);			\
					   else if (!(v & ~0xc000003f))			\
						r = imm12val(v, 2);			\
					   else if (!(v & ~0xf000000f))			\
						r = imm12val(v, 4);			\
					   else if (!(v & ~0xfc000003))			\
						r = imm12val(v, 6);			\
					   else if (!(v & ~0xff000000))			\
						r = imm12val(v, 8);			\
					   else if (!(v & ~0x3fc00000))			\
						r = imm12val(v, 10);			\
					   else if (!(v & ~0x0ff00000))			\
						r = imm12val(v, 12);			\
					   else if (!(v & ~0x03fc0000))			\
						r = imm12val(v, 14);			\
					   else if (!(v & ~0x00ff0000))			\
						r = imm12val(v, 16);			\
					   else if (!(v & ~0x003fc000))			\
						r = imm12val(v, 18);			\
					   else if (!(v & ~0x000ff000))			\
						r = imm12val(v, 20);			\
					   else if (!(v & ~0x0003fc00))			\
						r = imm12val(v, 22);			\
					   else if (!(v & ~0x0000ff00))			\
						r = imm12val(v, 24);			\
					   else if (!(v & ~0x00003fc0))			\
						r = imm12val(v, 26);			\
					   else if (!(v & ~0x00000ff0))			\
						r = imm12val(v, 28);			\
					   else if (!(v & ~0x000003fc))			\
						r = imm12val(v, 30);			\
					   else						\
						r = -1;					\
					   r; })
				295
				296	/*
				297	* Checks if immediate value can be converted to imm12(12 bits) value.
				298	*/
				299	static int imm8m(u32 x)
				300	{
				301	u32 rot;
				302
				303	for (rot = 0; rot < 16; rot++)
				304	if ((x & ~ror32(0xff, 2 * rot)) == 0)
				305	return rol32(x, 2 * rot) \| (rot << 8);
				306	return -1;
				307	}
				308
				309	#define imm8m(x) (__builtin_constant_p(x) ? const_imm8m(x) : imm8m(x))
				310
				311	static u32 arm_bpf_ldst_imm12(u32 op, u8 rt, u8 rn, s16 imm12)
				312	{
				313	op \|= rt << 12 \| rn << 16;
				314	if (imm12 >= 0)
				315	op \|= ARM_INST_LDST__U;
				316	else
				317	imm12 = -imm12;
				318	return op \| (imm12 & ARM_INST_LDST__IMM12);
				319	}
				320
				321	static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8)
				322	{
				323	op \|= rt << 12 \| rn << 16;
				324	if (imm8 >= 0)
				325	op \|= ARM_INST_LDST__U;
				326	else
				327	imm8 = -imm8;
				328	return op \| (imm8 & 0xf0) << 4 \| (imm8 & 0x0f);
				329	}
				330
				/* Loads: word/byte use the 12-bit offset form, dword/halfword the 8-bit form. */
				#define ARM_LDR_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_LDR_I, rt, rn, off)
				#define ARM_LDRB_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_LDRB_I, rt, rn, off)
				#define ARM_LDRD_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off)
				#define ARM_LDRH_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off)

				/* Stores: same split between the two offset forms as the loads. */
				#define ARM_STR_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off)
				#define ARM_STRB_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off)
				#define ARM_STRD_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off)
				#define ARM_STRH_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_STRH_I, rt, rn, off)
				340
				341	/*
				342	* Initializes the JIT space with undefined instructions.
				343	*/
				344	static void jit_fill_hole(void *area, unsigned int size)
				345	{
				346	u32 *ptr;
				347	/* We are guaranteed to have aligned memory. */
				348	for (ptr = area; size >= sizeof(u32); size -= sizeof(u32))
				349	*ptr++ = __opcode_to_mem_arm(ARM_INST_UDF);
				350	}
				351
				#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
				/* EABI requires the stack to be aligned to 64-bit boundaries */
				#define STACK_ALIGNMENT	8
				#else
				/* Stack must be aligned to 32-bit boundaries */
				#define STACK_ALIGNMENT	4
				#endif

				/*
				 * Total stack size used in JITed code: the program's own stack depth
				 * plus the JIT scratch area, rounded up to STACK_ALIGNMENT. Both macros
				 * expect a "ctx" variable to be in scope where they are expanded.
				 */
				#define _STACK_SIZE	(ctx->prog->aux->stack_depth + SCRATCH_SIZE)
				#define STACK_SIZE	ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
				363
				364	#if __LINUX_ARM_ARCH__ < 7
				365
				366	static u16 imm_offset(u32 k, struct jit_ctx *ctx)
				367	{
				368	unsigned int i = 0, offset;
				369	u16 imm;
				370
				371	/* on the "fake" run we just count them (duplicates included) */
				372	if (ctx->target == NULL) {
				373	ctx->imm_count++;
				374	return 0;
				375	}
				376
				377	while ((i < ctx->imm_count) && ctx->imms[i]) {
				378	if (ctx->imms[i] == k)
				379	break;
				380	i++;
				381	}
				382
				383	if (ctx->imms[i] == 0)
				384	ctx->imms[i] = k;
				385
				386	/* constants go just after the epilogue */
				387	offset = ctx->offsets[ctx->prog->len - 1] * 4;
				388	offset += ctx->prologue_bytes;
				389	offset += ctx->epilogue_bytes;
				390	offset += i * 4;
				391
				392	ctx->target[offset / 4] = k;
				393
				394	/* PC in ARM mode == address of the instruction + 8 */
				395	imm = offset - (8 + ctx->idx * 4);
				396
				397	if (imm & ~0xfff) {
				398	/*
				399	* literal pool is too far, signal it into flags. we
				400	* can only detect it on the second pass unfortunately.
				401	*/
				402	ctx->flags \|= FLAG_IMM_OVERFLOW;
				403	return 0;
				404	}
				405
				406	return imm;
				407	}
				408
				409	#endif /* __LINUX_ARM_ARCH__ */
				410
				411	static inline int bpf2a32_offset(int bpf_to, int bpf_from,
				412	const struct jit_ctx *ctx) {
				413	int to, from;
				414
				415	if (ctx->target == NULL)
				416	return 0;
				417	to = ctx->offsets[bpf_to];
				418	from = ctx->offsets[bpf_from];
				419
				420	return to - from - 1;
				421	}
				422
				423	/*
				424	* Move an immediate that's not an imm8m to a core register.
				425	*/
				426	static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx)
				427	{
				428	#if __LINUX_ARM_ARCH__ < 7
				429	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
				430	#else
				431	emit(ARM_MOVW(rd, val & 0xffff), ctx);
				432	if (val > 0xffff)
				433	emit(ARM_MOVT(rd, val >> 16), ctx);
				434	#endif
				435	}
				436
				437	static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx)
				438	{
				439	int imm12 = imm8m(val);
				440
				441	if (imm12 >= 0)
				442	emit(ARM_MOV_I(rd, imm12), ctx);
				443	else
				444	emit_mov_i_no8m(rd, val, ctx);
				445	}
				446
				447	static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx)
				448	{
				449	if (elf_hwcap & HWCAP_THUMB)
				450	emit(ARM_BX(tgt_reg), ctx);
				451	else
				452	emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
				453	}
				454
				455	static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
				456	{
				457	#if __LINUX_ARM_ARCH__ < 5
				458	emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
				459	emit_bx_r(tgt_reg, ctx);
				460	#else
				461	emit(ARM_BLX_R(tgt_reg), ctx);
				462	#endif
				463	}
				464
				465	static inline int epilogue_offset(const struct jit_ctx *ctx)
				466	{
				467	int to, from;
				468	/* No need for 1st dummy run */
				469	if (ctx->target == NULL)
				470	return 0;
				471	to = ctx->epilogue_offset;
				472	from = ctx->idx;
				473
				474	return to - from - 2;
				475	}
				476
				477	static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op)
				478	{
				479	const int exclude_mask = BIT(ARM_R0) \| BIT(ARM_R1);
				480	const s8 *tmp = bpf2a32[TMP_REG_1];
				481
				482	#if __LINUX_ARM_ARCH__ == 7
				483	if (elf_hwcap & HWCAP_IDIVA) {
				484	if (op == BPF_DIV)
				485	emit(ARM_UDIV(rd, rm, rn), ctx);
				486	else {
				487	emit(ARM_UDIV(ARM_IP, rm, rn), ctx);
				488	emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx);
				489	}
				490	return;
				491	}
				492	#endif
				493
				494	/*
				495	* For BPF_ALU \| BPF_DIV \| BPF_K instructions
				496	* As ARM_R1 and ARM_R0 contains 1st argument of bpf
				497	* function, we need to save it on caller side to save
				498	* it from getting destroyed within callee.
				499	* After the return from the callee, we restore ARM_R0
				500	* ARM_R1.
				501	*/
				502	if (rn != ARM_R1) {
				503	emit(ARM_MOV_R(tmp[0], ARM_R1), ctx);
				504	emit(ARM_MOV_R(ARM_R1, rn), ctx);
				505	}
				506	if (rm != ARM_R0) {
				507	emit(ARM_MOV_R(tmp[1], ARM_R0), ctx);
				508	emit(ARM_MOV_R(ARM_R0, rm), ctx);
				509	}
				510
				511	/* Push caller-saved registers on stack */
				512	emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx);
				513
				514	/* Call appropriate function */
				515	emit_mov_i(ARM_IP, op == BPF_DIV ?
				516	(u32)jit_udiv32 : (u32)jit_mod32, ctx);
				517	emit_blx_r(ARM_IP, ctx);
				518
				519	/* Restore caller-saved registers from stack */
				520	emit(ARM_POP(CALLER_MASK & ~exclude_mask), ctx);
				521
				522	/* Save return value */
				523	if (rd != ARM_R0)
				524	emit(ARM_MOV_R(rd, ARM_R0), ctx);
				525
				526	/* Restore ARM_R0 and ARM_R1 */
				527	if (rn != ARM_R1)
				528	emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx);
				529	if (rm != ARM_R0)
				530	emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx);
				531	}
				532
				533	/* Is the translated BPF register on stack? */
				534	static bool is_stacked(s8 reg)
				535	{
				536	return reg < 0;
				537	}
				538
				539	/* If a BPF register is on the stack (stk is true), load it to the
				540	* supplied temporary register and return the temporary register
				541	* for subsequent operations, otherwise just use the CPU register.
				542	*/
				543	static s8 arm_bpf_get_reg32(s8 reg, s8 tmp, struct jit_ctx *ctx)
				544	{
				545	if (is_stacked(reg)) {
				546	emit(ARM_LDR_I(tmp, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
				547	reg = tmp;
				548	}
				549	return reg;
				550	}
				551
				552	static const s8 arm_bpf_get_reg64(const s8 reg, const s8 *tmp,
				553	struct jit_ctx *ctx)
				554	{
				555	if (is_stacked(reg[1])) {
				556	if (__LINUX_ARM_ARCH__ >= 6 \|\|
				557	ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
				558	emit(ARM_LDRD_I(tmp[1], ARM_FP,
				559	EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
				560	} else {
				561	emit(ARM_LDR_I(tmp[1], ARM_FP,
				562	EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
				563	emit(ARM_LDR_I(tmp[0], ARM_FP,
				564	EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
				565	}
				566	reg = tmp;
				567	}
				568	return reg;
				569	}
				570
				571	/* If a BPF register is on the stack (stk is true), save the register
				572	* back to the stack. If the source register is not the same, then
				573	* move it into the correct register.
				574	*/
				575	static void arm_bpf_put_reg32(s8 reg, s8 src, struct jit_ctx *ctx)
				576	{
				577	if (is_stacked(reg))
				578	emit(ARM_STR_I(src, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
				579	else if (reg != src)
				580	emit(ARM_MOV_R(reg, src), ctx);
				581	}
				582
				583	static void arm_bpf_put_reg64(const s8 reg, const s8 src,
				584	struct jit_ctx *ctx)
				585	{
				586	if (is_stacked(reg[1])) {
				587	if (__LINUX_ARM_ARCH__ >= 6 \|\|
				588	ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
				589	emit(ARM_STRD_I(src[1], ARM_FP,
				590	EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
				591	} else {
				592	emit(ARM_STR_I(src[1], ARM_FP,
				593	EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
				594	emit(ARM_STR_I(src[0], ARM_FP,
				595	EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
				596	}
				597	} else {
				598	if (reg[1] != src[1])
				599	emit(ARM_MOV_R(reg[1], src[1]), ctx);
				600	if (reg[0] != src[0])
				601	emit(ARM_MOV_R(reg[0], src[0]), ctx);
				602	}
				603	}
				604
				605	static inline void emit_a32_mov_i(const s8 dst, const u32 val,
				606	struct jit_ctx *ctx)
				607	{
				608	const s8 *tmp = bpf2a32[TMP_REG_1];
				609
				610	if (is_stacked(dst)) {
				611	emit_mov_i(tmp[1], val, ctx);
				612	arm_bpf_put_reg32(dst, tmp[1], ctx);
				613	} else {
				614	emit_mov_i(dst, val, ctx);
				615	}
				616	}
				617
				618	static void emit_a32_mov_i64(const s8 dst[], u64 val, struct jit_ctx *ctx)
				619	{
				620	const s8 *tmp = bpf2a32[TMP_REG_1];
				621	const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
				622
				623	emit_mov_i(rd[1], (u32)val, ctx);
				624	emit_mov_i(rd[0], val >> 32, ctx);
				625
				626	arm_bpf_put_reg64(dst, rd, ctx);
				627	}
				628
				629	/* Sign extended move */
				630	static inline void emit_a32_mov_se_i64(const bool is64, const s8 dst[],
				631	const u32 val, struct jit_ctx *ctx) {
				632	u64 val64 = val;
				633
				634	if (is64 && (val & (1<<31)))
				635	val64 \|= 0xffffffff00000000ULL;
				636	emit_a32_mov_i64(dst, val64, ctx);
				637	}
				638
				639	static inline void emit_a32_add_r(const u8 dst, const u8 src,
				640	const bool is64, const bool hi,
				641	struct jit_ctx *ctx) {
				642	/* 64 bit :
				643	* adds dst_lo, dst_lo, src_lo
				644	* adc dst_hi, dst_hi, src_hi
				645	* 32 bit :
				646	* add dst_lo, dst_lo, src_lo
				647	*/
				648	if (!hi && is64)
				649	emit(ARM_ADDS_R(dst, dst, src), ctx);
				650	else if (hi && is64)
				651	emit(ARM_ADC_R(dst, dst, src), ctx);
				652	else
				653	emit(ARM_ADD_R(dst, dst, src), ctx);
				654	}
				655
				656	static inline void emit_a32_sub_r(const u8 dst, const u8 src,
				657	const bool is64, const bool hi,
				658	struct jit_ctx *ctx) {
				659	/* 64 bit :
				660	* subs dst_lo, dst_lo, src_lo
				661	* sbc dst_hi, dst_hi, src_hi
				662	* 32 bit :
				663	* sub dst_lo, dst_lo, src_lo
				664	*/
				665	if (!hi && is64)
				666	emit(ARM_SUBS_R(dst, dst, src), ctx);
				667	else if (hi && is64)
				668	emit(ARM_SBC_R(dst, dst, src), ctx);
				669	else
				670	emit(ARM_SUB_R(dst, dst, src), ctx);
				671	}
				672
				673	static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64,
				674	const bool hi, const u8 op, struct jit_ctx *ctx){
				675	switch (BPF_OP(op)) {
				676	/* dst = dst + src */
				677	case BPF_ADD:
				678	emit_a32_add_r(dst, src, is64, hi, ctx);
				679	break;
				680	/* dst = dst - src */
				681	case BPF_SUB:
				682	emit_a32_sub_r(dst, src, is64, hi, ctx);
				683	break;
				684	/* dst = dst \| src */
				685	case BPF_OR:
				686	emit(ARM_ORR_R(dst, dst, src), ctx);
				687	break;
				688	/* dst = dst & src */
				689	case BPF_AND:
				690	emit(ARM_AND_R(dst, dst, src), ctx);
				691	break;
				692	/* dst = dst ^ src */
				693	case BPF_XOR:
				694	emit(ARM_EOR_R(dst, dst, src), ctx);
				695	break;
				696	/* dst = dst * src */
				697	case BPF_MUL:
				698	emit(ARM_MUL(dst, dst, src), ctx);
				699	break;
				700	/* dst = dst << src */
				701	case BPF_LSH:
				702	emit(ARM_LSL_R(dst, dst, src), ctx);
				703	break;
				704	/* dst = dst >> src */
				705	case BPF_RSH:
				706	emit(ARM_LSR_R(dst, dst, src), ctx);
				707	break;
				708	/* dst = dst >> src (signed)*/
				709	case BPF_ARSH:
				710	emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx);
				711	break;
				712	}
				713	}
				714
				715	/* ALU operation (32 bit)
				716	* dst = dst (op) src
				717	*/
				718	static inline void emit_a32_alu_r(const s8 dst, const s8 src,
				719	struct jit_ctx *ctx, const bool is64,
				720	const bool hi, const u8 op) {
				721	const s8 *tmp = bpf2a32[TMP_REG_1];
				722	s8 rn, rd;
				723
				724	rn = arm_bpf_get_reg32(src, tmp[1], ctx);
				725	rd = arm_bpf_get_reg32(dst, tmp[0], ctx);
				726	/* ALU operation */
				727	emit_alu_r(rd, rn, is64, hi, op, ctx);
				728	arm_bpf_put_reg32(dst, rd, ctx);
				729	}
				730
				731	/* ALU operation (64 bit) */
				732	static inline void emit_a32_alu_r64(const bool is64, const s8 dst[],
				733	const s8 src[], struct jit_ctx *ctx,
				734	const u8 op) {
				735	const s8 *tmp = bpf2a32[TMP_REG_1];
				736	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				737	const s8 *rd;
				738
				739	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				740	if (is64) {
				741	const s8 *rs;
				742
				743	rs = arm_bpf_get_reg64(src, tmp2, ctx);
				744
				745	/* ALU operation */
				746	emit_alu_r(rd[1], rs[1], true, false, op, ctx);
				747	emit_alu_r(rd[0], rs[0], true, true, op, ctx);
				748	} else {
				749	s8 rs;
				750
				751	rs = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				752
				753	/* ALU operation */
				754	emit_alu_r(rd[1], rs, true, false, op, ctx);
				755	if (!ctx->prog->aux->verifier_zext)
				756	emit_a32_mov_i(rd[0], 0, ctx);
				757	}
				758
				759	arm_bpf_put_reg64(dst, rd, ctx);
				760	}
				761
				762	/* dst = src (4 bytes)*/
				763	static inline void emit_a32_mov_r(const s8 dst, const s8 src,
				764	struct jit_ctx *ctx) {
				765	const s8 *tmp = bpf2a32[TMP_REG_1];
				766	s8 rt;
				767
				768	rt = arm_bpf_get_reg32(src, tmp[0], ctx);
				769	arm_bpf_put_reg32(dst, rt, ctx);
				770	}
				771
				772	/* dst = src */
				773	static inline void emit_a32_mov_r64(const bool is64, const s8 dst[],
				774	const s8 src[],
				775	struct jit_ctx *ctx) {
				776	if (!is64) {
				777	emit_a32_mov_r(dst_lo, src_lo, ctx);
				778	if (!ctx->prog->aux->verifier_zext)
				779	/* Zero out high 4 bytes */
				780	emit_a32_mov_i(dst_hi, 0, ctx);
				781	} else if (__LINUX_ARM_ARCH__ < 6 &&
				782	ctx->cpu_architecture < CPU_ARCH_ARMv5TE) {
				783	/* complete 8 byte move */
				784	emit_a32_mov_r(dst_lo, src_lo, ctx);
				785	emit_a32_mov_r(dst_hi, src_hi, ctx);
				786	} else if (is_stacked(src_lo) && is_stacked(dst_lo)) {
				787	const u8 *tmp = bpf2a32[TMP_REG_1];
				788
				789	emit(ARM_LDRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
				790	emit(ARM_STRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
				791	} else if (is_stacked(src_lo)) {
				792	emit(ARM_LDRD_I(dst[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
				793	} else if (is_stacked(dst_lo)) {
				794	emit(ARM_STRD_I(src[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
				795	} else {
				796	emit(ARM_MOV_R(dst[0], src[0]), ctx);
				797	emit(ARM_MOV_R(dst[1], src[1]), ctx);
				798	}
				799	}
				800
				801	/* Shift operations */
				802	static inline void emit_a32_alu_i(const s8 dst, const u32 val,
				803	struct jit_ctx *ctx, const u8 op) {
				804	const s8 *tmp = bpf2a32[TMP_REG_1];
				805	s8 rd;
				806
				807	rd = arm_bpf_get_reg32(dst, tmp[0], ctx);
				808
				809	/* Do shift operation */
				810	switch (op) {
				811	case BPF_LSH:
				812	emit(ARM_LSL_I(rd, rd, val), ctx);
				813	break;
				814	case BPF_RSH:
				815	emit(ARM_LSR_I(rd, rd, val), ctx);
				816	break;
				817	case BPF_NEG:
				818	emit(ARM_RSB_I(rd, rd, val), ctx);
				819	break;
				820	}
				821
				822	arm_bpf_put_reg32(dst, rd, ctx);
				823	}
				824
				825	/* dst = ~dst (64 bit) */
				826	static inline void emit_a32_neg64(const s8 dst[],
				827	struct jit_ctx *ctx){
				828	const s8 *tmp = bpf2a32[TMP_REG_1];
				829	const s8 *rd;
				830
				831	/* Setup Operand */
				832	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				833
				834	/* Do Negate Operation */
				835	emit(ARM_RSBS_I(rd[1], rd[1], 0), ctx);
				836	emit(ARM_RSC_I(rd[0], rd[0], 0), ctx);
				837
				838	arm_bpf_put_reg64(dst, rd, ctx);
				839	}
				840
				841	/* dst = dst << src */
				842	static inline void emit_a32_lsh_r64(const s8 dst[], const s8 src[],
				843	struct jit_ctx *ctx) {
				844	const s8 *tmp = bpf2a32[TMP_REG_1];
				845	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				846	const s8 *rd;
				847	s8 rt;
				848
				849	/* Setup Operands */
				850	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				851	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				852
				853	/* Do LSH operation */
				854	emit(ARM_SUB_I(ARM_IP, rt, 32), ctx);
				855	emit(ARM_RSB_I(tmp2[0], rt, 32), ctx);
				856	emit(ARM_MOV_SR(ARM_LR, rd[0], SRTYPE_ASL, rt), ctx);
				857	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[1], SRTYPE_ASL, ARM_IP), ctx);
				858	emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd[1], SRTYPE_LSR, tmp2[0]), ctx);
				859	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_ASL, rt), ctx);
				860
				861	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
				862	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
				863	}
				864
				865	/* dst = dst >> src (signed)*/
				866	static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
				867	struct jit_ctx *ctx) {
				868	const s8 *tmp = bpf2a32[TMP_REG_1];
				869	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				870	const s8 *rd;
				871	s8 rt;
				872
				873	/* Setup Operands */
				874	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				875	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				876
				877	/* Do the ARSH operation */
				878	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
				879	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
				880	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
				881	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
				882	_emit(ARM_COND_MI, ARM_B(0), ctx);
				883	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
				884	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);
				885
				886	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
				887	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
				888	}
				889
				890	/* dst = dst >> src */
				891	static inline void emit_a32_rsh_r64(const s8 dst[], const s8 src[],
				892	struct jit_ctx *ctx) {
				893	const s8 *tmp = bpf2a32[TMP_REG_1];
				894	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				895	const s8 *rd;
				896	s8 rt;
				897
				898	/* Setup Operands */
				899	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				900	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				901
				902	/* Do RSH operation */
				903	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
				904	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
				905	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
				906	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
				907	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_LSR, tmp2[0]), ctx);
				908	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_LSR, rt), ctx);
				909
				910	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
				911	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
				912	}
				913
				914	/* dst = dst << val */
				915	static inline void emit_a32_lsh_i64(const s8 dst[],
				916	const u32 val, struct jit_ctx *ctx){
				917	const s8 *tmp = bpf2a32[TMP_REG_1];
				918	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				919	const s8 *rd;
				920
				921	/* Setup operands */
				922	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				923
				924	/* Do LSH operation */
				925	if (val < 32) {
				926	emit(ARM_MOV_SI(tmp2[0], rd[0], SRTYPE_ASL, val), ctx);
				927	emit(ARM_ORR_SI(rd[0], tmp2[0], rd[1], SRTYPE_LSR, 32 - val), ctx);
				928	emit(ARM_MOV_SI(rd[1], rd[1], SRTYPE_ASL, val), ctx);
				929	} else {
				930	if (val == 32)
				931	emit(ARM_MOV_R(rd[0], rd[1]), ctx);
				932	else
				933	emit(ARM_MOV_SI(rd[0], rd[1], SRTYPE_ASL, val - 32), ctx);
				934	emit(ARM_EOR_R(rd[1], rd[1], rd[1]), ctx);
				935	}
				936
				937	arm_bpf_put_reg64(dst, rd, ctx);
				938	}
				939
				940	/* dst = dst >> val */
				941	static inline void emit_a32_rsh_i64(const s8 dst[],
				942	const u32 val, struct jit_ctx *ctx) {
				943	const s8 *tmp = bpf2a32[TMP_REG_1];
				944	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				945	const s8 *rd;
				946
				947	/* Setup operands */
				948	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				949
				950	/* Do LSR operation */
				951	if (val == 0) {
				952	/* An immediate value of 0 encodes a shift amount of 32
				953	* for LSR. To shift by 0, don't do anything.
				954	*/
				955	} else if (val < 32) {
				956	emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
				957	emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
				958	emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_LSR, val), ctx);
				959	} else if (val == 32) {
				960	emit(ARM_MOV_R(rd[1], rd[0]), ctx);
				961	emit(ARM_MOV_I(rd[0], 0), ctx);
				962	} else {
				963	emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_LSR, val - 32), ctx);
				964	emit(ARM_MOV_I(rd[0], 0), ctx);
				965	}
				966
				967	arm_bpf_put_reg64(dst, rd, ctx);
				968	}
				969
				970	/* dst = dst >> val (signed) */
				971	static inline void emit_a32_arsh_i64(const s8 dst[],
				972	const u32 val, struct jit_ctx *ctx){
				973	const s8 *tmp = bpf2a32[TMP_REG_1];
				974	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				975	const s8 *rd;
				976
				977	/* Setup operands */
				978	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				979
				980	/* Do ARSH operation */
				981	if (val == 0) {
				982	/* An immediate value of 0 encodes a shift amount of 32
				983	* for ASR. To shift by 0, don't do anything.
				984	*/
				985	} else if (val < 32) {
				986	emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
				987	emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
				988	emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, val), ctx);
				989	} else if (val == 32) {
				990	emit(ARM_MOV_R(rd[1], rd[0]), ctx);
				991	emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
				992	} else {
				993	emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_ASR, val - 32), ctx);
				994	emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
				995	}
				996
				997	arm_bpf_put_reg64(dst, rd, ctx);
				998	}
				999
				1000	static inline void emit_a32_mul_r64(const s8 dst[], const s8 src[],
				1001	struct jit_ctx *ctx) {
				1002	const s8 *tmp = bpf2a32[TMP_REG_1];
				1003	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1004	const s8 rd, rt;
				1005
				1006	/* Setup operands for multiplication */
				1007	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				1008	rt = arm_bpf_get_reg64(src, tmp2, ctx);
				1009
				1010	/* Do Multiplication */
				1011	emit(ARM_MUL(ARM_IP, rd[1], rt[0]), ctx);
				1012	emit(ARM_MUL(ARM_LR, rd[0], rt[1]), ctx);
				1013	emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx);
				1014
				1015	emit(ARM_UMULL(ARM_IP, rd[0], rd[1], rt[1]), ctx);
				1016	emit(ARM_ADD_R(rd[0], ARM_LR, rd[0]), ctx);
				1017
				1018	arm_bpf_put_reg32(dst_lo, ARM_IP, ctx);
				1019	arm_bpf_put_reg32(dst_hi, rd[0], ctx);
				1020	}
				1021
				1022	static bool is_ldst_imm(s16 off, const u8 size)
				1023	{
				1024	s16 off_max = 0;
				1025
				1026	switch (size) {
				1027	case BPF_B:
				1028	case BPF_W:
				1029	off_max = 0xfff;
				1030	break;
				1031	case BPF_H:
				1032	off_max = 0xff;
				1033	break;
				1034	case BPF_DW:
				1035	/* Need to make sure off+4 does not overflow. */
				1036	off_max = 0xfff - 4;
				1037	break;
				1038	}
				1039	return -off_max <= off && off <= off_max;
				1040	}
				1041
				1042	/* (size )(dst + off) = src */
				1043	static inline void emit_str_r(const s8 dst, const s8 src[],
				1044	s16 off, struct jit_ctx *ctx, const u8 sz){
				1045	const s8 *tmp = bpf2a32[TMP_REG_1];
				1046	s8 rd;
				1047
				1048	rd = arm_bpf_get_reg32(dst, tmp[1], ctx);
				1049
				1050	if (!is_ldst_imm(off, sz)) {
				1051	emit_a32_mov_i(tmp[0], off, ctx);
				1052	emit(ARM_ADD_R(tmp[0], tmp[0], rd), ctx);
				1053	rd = tmp[0];
				1054	off = 0;
				1055	}
				1056	switch (sz) {
				1057	case BPF_B:
				1058	/* Store a Byte */
				1059	emit(ARM_STRB_I(src_lo, rd, off), ctx);
				1060	break;
				1061	case BPF_H:
				1062	/* Store a HalfWord */
				1063	emit(ARM_STRH_I(src_lo, rd, off), ctx);
				1064	break;
				1065	case BPF_W:
				1066	/* Store a Word */
				1067	emit(ARM_STR_I(src_lo, rd, off), ctx);
				1068	break;
				1069	case BPF_DW:
				1070	/* Store a Double Word */
				1071	emit(ARM_STR_I(src_lo, rd, off), ctx);
				1072	emit(ARM_STR_I(src_hi, rd, off + 4), ctx);
				1073	break;
				1074	}
				1075	}
				1076
				1077	/* dst = (size)(src + off) */
				1078	static inline void emit_ldx_r(const s8 dst[], const s8 src,
				1079	s16 off, struct jit_ctx *ctx, const u8 sz){
				1080	const s8 *tmp = bpf2a32[TMP_REG_1];
				1081	const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
				1082	s8 rm = src;
				1083
				1084	if (!is_ldst_imm(off, sz)) {
				1085	emit_a32_mov_i(tmp[0], off, ctx);
				1086	emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
				1087	rm = tmp[0];
				1088	off = 0;
				1089	} else if (rd[1] == rm) {
				1090	emit(ARM_MOV_R(tmp[0], rm), ctx);
				1091	rm = tmp[0];
				1092	}
				1093	switch (sz) {
				1094	case BPF_B:
				1095	/* Load a Byte */
				1096	emit(ARM_LDRB_I(rd[1], rm, off), ctx);
				1097	if (!ctx->prog->aux->verifier_zext)
				1098	emit_a32_mov_i(rd[0], 0, ctx);
				1099	break;
				1100	case BPF_H:
				1101	/* Load a HalfWord */
				1102	emit(ARM_LDRH_I(rd[1], rm, off), ctx);
				1103	if (!ctx->prog->aux->verifier_zext)
				1104	emit_a32_mov_i(rd[0], 0, ctx);
				1105	break;
				1106	case BPF_W:
				1107	/* Load a Word */
				1108	emit(ARM_LDR_I(rd[1], rm, off), ctx);
				1109	if (!ctx->prog->aux->verifier_zext)
				1110	emit_a32_mov_i(rd[0], 0, ctx);
				1111	break;
				1112	case BPF_DW:
				1113	/* Load a Double Word */
				1114	emit(ARM_LDR_I(rd[1], rm, off), ctx);
				1115	emit(ARM_LDR_I(rd[0], rm, off + 4), ctx);
				1116	break;
				1117	}
				1118	arm_bpf_put_reg64(dst, rd, ctx);
				1119	}
				1120
				1121	/* Arithmatic Operation */
				1122	static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm,
				1123	const u8 rn, struct jit_ctx *ctx, u8 op,
				1124	bool is_jmp64) {
				1125	switch (op) {
				1126	case BPF_JSET:
				1127	if (is_jmp64) {
				1128	emit(ARM_AND_R(ARM_IP, rt, rn), ctx);
				1129	emit(ARM_AND_R(ARM_LR, rd, rm), ctx);
				1130	emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx);
				1131	} else {
				1132	emit(ARM_ANDS_R(ARM_IP, rt, rn), ctx);
				1133	}
				1134	break;
				1135	case BPF_JEQ:
				1136	case BPF_JNE:
				1137	case BPF_JGT:
				1138	case BPF_JGE:
				1139	case BPF_JLE:
				1140	case BPF_JLT:
				1141	if (is_jmp64) {
				1142	emit(ARM_CMP_R(rd, rm), ctx);
				1143	/* Only compare low halve if high halve are equal. */
				1144	_emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx);
				1145	} else {
				1146	emit(ARM_CMP_R(rt, rn), ctx);
				1147	}
				1148	break;
				1149	case BPF_JSLE:
				1150	case BPF_JSGT:
				1151	emit(ARM_CMP_R(rn, rt), ctx);
				1152	if (is_jmp64)
				1153	emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx);
				1154	break;
				1155	case BPF_JSLT:
				1156	case BPF_JSGE:
				1157	emit(ARM_CMP_R(rt, rn), ctx);
				1158	if (is_jmp64)
				1159	emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx);
				1160	break;
				1161	}
				1162	}
				1163
				1164	static int out_offset = -1; /* initialized on the first pass of build_body() */
				1165	static int emit_bpf_tail_call(struct jit_ctx *ctx)
				1166	{
				1167
				1168	/* bpf_tail_call(void prog_ctx, struct bpf_array array, u64 index) */
				1169	const s8 *r2 = bpf2a32[BPF_REG_2];
				1170	const s8 *r3 = bpf2a32[BPF_REG_3];
				1171	const s8 *tmp = bpf2a32[TMP_REG_1];
				1172	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1173	const s8 *tcc = bpf2a32[TCALL_CNT];
				1174	const s8 *tc;
				1175	const int idx0 = ctx->idx;
				1176	#define cur_offset (ctx->idx - idx0)
				1177	#define jmp_offset (out_offset - (cur_offset) - 2)
				1178	u32 lo, hi;
				1179	s8 r_array, r_index;
				1180	int off;
				1181
				1182	/* if (index >= array->map.max_entries)
				1183	* goto out;
				1184	*/
				1185	BUILD_BUG_ON(offsetof(struct bpf_array, map.max_entries) >
				1186	ARM_INST_LDST__IMM12);
				1187	off = offsetof(struct bpf_array, map.max_entries);
				1188	r_array = arm_bpf_get_reg32(r2[1], tmp2[0], ctx);
				1189	/* index is 32-bit for arrays */
				1190	r_index = arm_bpf_get_reg32(r3[1], tmp2[1], ctx);
				1191	/* array->map.max_entries */
				1192	emit(ARM_LDR_I(tmp[1], r_array, off), ctx);
				1193	/* index >= array->map.max_entries */
				1194	emit(ARM_CMP_R(r_index, tmp[1]), ctx);
				1195	_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
				1196
				1197	/* tmp2[0] = array, tmp2[1] = index */
				1198
				1199	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
				1200	* goto out;
				1201	* tail_call_cnt++;
				1202	*/
				1203	lo = (u32)MAX_TAIL_CALL_CNT;
				1204	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
				1205	tc = arm_bpf_get_reg64(tcc, tmp, ctx);
				1206	emit(ARM_CMP_I(tc[0], hi), ctx);
				1207	_emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx);
				1208	_emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
				1209	emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx);
				1210	emit(ARM_ADC_I(tc[0], tc[0], 0), ctx);
				1211	arm_bpf_put_reg64(tcc, tmp, ctx);
				1212
				1213	/* prog = array->ptrs[index]
				1214	* if (prog == NULL)
				1215	* goto out;
				1216	*/
				1217	BUILD_BUG_ON(imm8m(offsetof(struct bpf_array, ptrs)) < 0);
				1218	off = imm8m(offsetof(struct bpf_array, ptrs));
				1219	emit(ARM_ADD_I(tmp[1], r_array, off), ctx);
				1220	emit(ARM_LDR_R_SI(tmp[1], tmp[1], r_index, SRTYPE_ASL, 2), ctx);
				1221	emit(ARM_CMP_I(tmp[1], 0), ctx);
				1222	_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
				1223
				1224	/* goto (prog->bpf_func + prologue_size); /
				1225	BUILD_BUG_ON(offsetof(struct bpf_prog, bpf_func) >
				1226	ARM_INST_LDST__IMM12);
				1227	off = offsetof(struct bpf_prog, bpf_func);
				1228	emit(ARM_LDR_I(tmp[1], tmp[1], off), ctx);
				1229	emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx);
				1230	emit_bx_r(tmp[1], ctx);
				1231
				1232	/* out: */
				1233	if (out_offset == -1)
				1234	out_offset = cur_offset;
				1235	if (cur_offset != out_offset) {
				1236	pr_err_once("tail_call out_offset = %d, expected %d!\n",
				1237	cur_offset, out_offset);
				1238	return -1;
				1239	}
				1240	return 0;
				1241	#undef cur_offset
				1242	#undef jmp_offset
				1243	}
				1244
				1245	/* 0xabcd => 0xcdab */
				1246	static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx)
				1247	{
				1248	#if __LINUX_ARM_ARCH__ < 6
				1249	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1250
				1251	emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
				1252	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx);
				1253	emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
				1254	emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx);
				1255	#else /* ARMv6+ */
				1256	emit(ARM_REV16(rd, rn), ctx);
				1257	#endif
				1258	}
				1259
				1260	/* 0xabcdefgh => 0xghefcdab */
				1261	static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx)
				1262	{
				1263	#if __LINUX_ARM_ARCH__ < 6
				1264	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1265
				1266	emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
				1267	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx);
				1268	emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx);
				1269
				1270	emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx);
				1271	emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx);
				1272	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx);
				1273	emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
				1274	emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx);
				1275	emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx);
				1276	emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx);
				1277
				1278	#else /* ARMv6+ */
				1279	emit(ARM_REV(rd, rn), ctx);
				1280	#endif
				1281	}
				1282
				1283	// push the scratch stack register on top of the stack
				1284	static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx)
				1285	{
				1286	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1287	const s8 *rt;
				1288	u16 reg_set = 0;
				1289
				1290	rt = arm_bpf_get_reg64(src, tmp2, ctx);
				1291
				1292	reg_set = (1 << rt[1]) \| (1 << rt[0]);
				1293	emit(ARM_PUSH(reg_set), ctx);
				1294	}
				1295
				1296	static void build_prologue(struct jit_ctx *ctx)
				1297	{
				1298	const s8 r0 = bpf2a32[BPF_REG_0][1];
				1299	const s8 r2 = bpf2a32[BPF_REG_1][1];
				1300	const s8 r3 = bpf2a32[BPF_REG_1][0];
				1301	const s8 r4 = bpf2a32[BPF_REG_6][1];
				1302	const s8 fplo = bpf2a32[BPF_REG_FP][1];
				1303	const s8 fphi = bpf2a32[BPF_REG_FP][0];
				1304	const s8 *tcc = bpf2a32[TCALL_CNT];
				1305
				1306	/* Save callee saved registers. */
				1307	#ifdef CONFIG_FRAME_POINTER
				1308	u16 reg_set = CALLEE_PUSH_MASK \| 1 << ARM_IP \| 1 << ARM_PC;
				1309	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
				1310	emit(ARM_PUSH(reg_set), ctx);
				1311	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
				1312	#else
				1313	emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
				1314	emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
				1315	#endif
				1316	/* Save frame pointer for later */
				1317	emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx);
				1318
				1319	ctx->stack_size = imm8m(STACK_SIZE);
				1320
				1321	/* Set up function call stack */
				1322	emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);
				1323
				1324	/* Set up BPF prog stack base register */
				1325	emit_a32_mov_r(fplo, ARM_IP, ctx);
				1326	emit_a32_mov_i(fphi, 0, ctx);
				1327
				1328	/* mov r4, 0 */
				1329	emit(ARM_MOV_I(r4, 0), ctx);
				1330
				1331	/* Move BPF_CTX to BPF_R1 */
				1332	emit(ARM_MOV_R(r3, r4), ctx);
				1333	emit(ARM_MOV_R(r2, r0), ctx);
				1334	/* Initialize Tail Count */
				1335	emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[0])), ctx);
				1336	emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[1])), ctx);
				1337	/* end of prologue */
				1338	}
				1339
				1340	/* restore callee saved registers. */
				1341	static void build_epilogue(struct jit_ctx *ctx)
				1342	{
				1343	#ifdef CONFIG_FRAME_POINTER
				1344	/* When using frame pointers, some additional registers need to
				1345	* be loaded. */
				1346	u16 reg_set = CALLEE_POP_MASK \| 1 << ARM_SP;
				1347	emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx);
				1348	emit(ARM_LDM(ARM_SP, reg_set), ctx);
				1349	#else
				1350	/* Restore callee saved registers. */
				1351	emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx);
				1352	emit(ARM_POP(CALLEE_POP_MASK), ctx);
				1353	#endif
				1354	}
				1355
				1356	/*
				1357	* Convert an eBPF instruction to native instruction, i.e
				1358	* JITs an eBPF instruction.
				1359	* Returns :
				1360	* 0 - Successfully JITed an 8-byte eBPF instruction
				1361	* >0 - Successfully JITed a 16-byte eBPF instruction
				1362	* <0 - Failed to JIT.
				1363	*/
				1364	static int build_insn(const struct bpf_insn insn, struct jit_ctx ctx)
				1365	{
				1366	const u8 code = insn->code;
				1367	const s8 *dst = bpf2a32[insn->dst_reg];
				1368	const s8 *src = bpf2a32[insn->src_reg];
				1369	const s8 *tmp = bpf2a32[TMP_REG_1];
				1370	const s8 *tmp2 = bpf2a32[TMP_REG_2];
				1371	const s16 off = insn->off;
				1372	const s32 imm = insn->imm;
				1373	const int i = insn - ctx->prog->insnsi;
				1374	const bool is64 = BPF_CLASS(code) == BPF_ALU64;
				1375	const s8 rd, rs;
				1376	s8 rd_lo, rt, rm, rn;
				1377	s32 jmp_offset;
				1378
				1379	#define check_imm(bits, imm) do { \
				1380	if ((imm) >= (1 << ((bits) - 1)) \|\| \
				1381	(imm) < -(1 << ((bits) - 1))) { \
				1382	pr_info("[%2d] imm=%d(0x%x) out of range\n", \
				1383	i, imm, imm); \
				1384	return -EINVAL; \
				1385	} \
				1386	} while (0)
				1387	#define check_imm24(imm) check_imm(24, imm)
				1388
				1389	switch (code) {
				1390	/* ALU operations */
				1391
				1392	/* dst = src */
				1393	case BPF_ALU \| BPF_MOV \| BPF_K:
				1394	case BPF_ALU \| BPF_MOV \| BPF_X:
				1395	case BPF_ALU64 \| BPF_MOV \| BPF_K:
				1396	case BPF_ALU64 \| BPF_MOV \| BPF_X:
				1397	switch (BPF_SRC(code)) {
				1398	case BPF_X:
				1399	if (imm == 1) {
				1400	/* Special mov32 for zext */
				1401	emit_a32_mov_i(dst_hi, 0, ctx);
				1402	break;
				1403	}
				1404	emit_a32_mov_r64(is64, dst, src, ctx);
				1405	break;
				1406	case BPF_K:
				1407	/* Sign-extend immediate value to destination reg */
				1408	emit_a32_mov_se_i64(is64, dst, imm, ctx);
				1409	break;
				1410	}
				1411	break;
				1412	/* dst = dst + src/imm */
				1413	/* dst = dst - src/imm */
				1414	/* dst = dst \| src/imm */
				1415	/* dst = dst & src/imm */
				1416	/* dst = dst ^ src/imm */
				1417	/* dst = dst * src/imm */
				1418	/* dst = dst << src */
				1419	/* dst = dst >> src */
				1420	case BPF_ALU \| BPF_ADD \| BPF_K:
				1421	case BPF_ALU \| BPF_ADD \| BPF_X:
				1422	case BPF_ALU \| BPF_SUB \| BPF_K:
				1423	case BPF_ALU \| BPF_SUB \| BPF_X:
				1424	case BPF_ALU \| BPF_OR \| BPF_K:
				1425	case BPF_ALU \| BPF_OR \| BPF_X:
				1426	case BPF_ALU \| BPF_AND \| BPF_K:
				1427	case BPF_ALU \| BPF_AND \| BPF_X:
				1428	case BPF_ALU \| BPF_XOR \| BPF_K:
				1429	case BPF_ALU \| BPF_XOR \| BPF_X:
				1430	case BPF_ALU \| BPF_MUL \| BPF_K:
				1431	case BPF_ALU \| BPF_MUL \| BPF_X:
				1432	case BPF_ALU \| BPF_LSH \| BPF_X:
				1433	case BPF_ALU \| BPF_RSH \| BPF_X:
				1434	case BPF_ALU \| BPF_ARSH \| BPF_K:
				1435	case BPF_ALU \| BPF_ARSH \| BPF_X:
				1436	case BPF_ALU64 \| BPF_ADD \| BPF_K:
				1437	case BPF_ALU64 \| BPF_ADD \| BPF_X:
				1438	case BPF_ALU64 \| BPF_SUB \| BPF_K:
				1439	case BPF_ALU64 \| BPF_SUB \| BPF_X:
				1440	case BPF_ALU64 \| BPF_OR \| BPF_K:
				1441	case BPF_ALU64 \| BPF_OR \| BPF_X:
				1442	case BPF_ALU64 \| BPF_AND \| BPF_K:
				1443	case BPF_ALU64 \| BPF_AND \| BPF_X:
				1444	case BPF_ALU64 \| BPF_XOR \| BPF_K:
				1445	case BPF_ALU64 \| BPF_XOR \| BPF_X:
				1446	switch (BPF_SRC(code)) {
				1447	case BPF_X:
				1448	emit_a32_alu_r64(is64, dst, src, ctx, BPF_OP(code));
				1449	break;
				1450	case BPF_K:
				1451	/* Move immediate value to the temporary register
				1452	* and then do the ALU operation on the temporary
				1453	* register as this will sign-extend the immediate
				1454	* value into temporary reg and then it would be
				1455	* safe to do the operation on it.
				1456	*/
				1457	emit_a32_mov_se_i64(is64, tmp2, imm, ctx);
				1458	emit_a32_alu_r64(is64, dst, tmp2, ctx, BPF_OP(code));
				1459	break;
				1460	}
				1461	break;
				1462	/* dst = dst / src(imm) */
				1463	/* dst = dst % src(imm) */
				1464	case BPF_ALU \| BPF_DIV \| BPF_K:
				1465	case BPF_ALU \| BPF_DIV \| BPF_X:
				1466	case BPF_ALU \| BPF_MOD \| BPF_K:
				1467	case BPF_ALU \| BPF_MOD \| BPF_X:
				1468	rd_lo = arm_bpf_get_reg32(dst_lo, tmp2[1], ctx);
				1469	switch (BPF_SRC(code)) {
				1470	case BPF_X:
				1471	rt = arm_bpf_get_reg32(src_lo, tmp2[0], ctx);
				1472	break;
				1473	case BPF_K:
				1474	rt = tmp2[0];
				1475	emit_a32_mov_i(rt, imm, ctx);
				1476	break;
				1477	default:
				1478	rt = src_lo;
				1479	break;
				1480	}
				1481	emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code));
				1482	arm_bpf_put_reg32(dst_lo, rd_lo, ctx);
				1483	if (!ctx->prog->aux->verifier_zext)
				1484	emit_a32_mov_i(dst_hi, 0, ctx);
				1485	break;
				1486	case BPF_ALU64 \| BPF_DIV \| BPF_K:
				1487	case BPF_ALU64 \| BPF_DIV \| BPF_X:
				1488	case BPF_ALU64 \| BPF_MOD \| BPF_K:
				1489	case BPF_ALU64 \| BPF_MOD \| BPF_X:
				1490	goto notyet;
				1491	/* dst = dst >> imm */
				1492	/* dst = dst << imm */
				1493	case BPF_ALU \| BPF_RSH \| BPF_K:
				1494	case BPF_ALU \| BPF_LSH \| BPF_K:
				1495	if (unlikely(imm > 31))
				1496	return -EINVAL;
				1497	if (imm)
				1498	emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code));
				1499	if (!ctx->prog->aux->verifier_zext)
				1500	emit_a32_mov_i(dst_hi, 0, ctx);
				1501	break;
				1502	/* dst = dst << imm */
				1503	case BPF_ALU64 \| BPF_LSH \| BPF_K:
				1504	if (unlikely(imm > 63))
				1505	return -EINVAL;
				1506	emit_a32_lsh_i64(dst, imm, ctx);
				1507	break;
				1508	/* dst = dst >> imm */
				1509	case BPF_ALU64 \| BPF_RSH \| BPF_K:
				1510	if (unlikely(imm > 63))
				1511	return -EINVAL;
				1512	emit_a32_rsh_i64(dst, imm, ctx);
				1513	break;
				1514	/* dst = dst << src */
				1515	case BPF_ALU64 \| BPF_LSH \| BPF_X:
				1516	emit_a32_lsh_r64(dst, src, ctx);
				1517	break;
				1518	/* dst = dst >> src */
				1519	case BPF_ALU64 \| BPF_RSH \| BPF_X:
				1520	emit_a32_rsh_r64(dst, src, ctx);
				1521	break;
				1522	/* dst = dst >> src (signed) */
				1523	case BPF_ALU64 \| BPF_ARSH \| BPF_X:
				1524	emit_a32_arsh_r64(dst, src, ctx);
				1525	break;
				1526	/* dst = dst >> imm (signed) */
				1527	case BPF_ALU64 \| BPF_ARSH \| BPF_K:
				1528	if (unlikely(imm > 63))
				1529	return -EINVAL;
				1530	emit_a32_arsh_i64(dst, imm, ctx);
				1531	break;
				1532	/* dst = ~dst */
				1533	case BPF_ALU \| BPF_NEG:
				1534	emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code));
				1535	if (!ctx->prog->aux->verifier_zext)
				1536	emit_a32_mov_i(dst_hi, 0, ctx);
				1537	break;
				1538	/* dst = ~dst (64 bit) */
				1539	case BPF_ALU64 \| BPF_NEG:
				1540	emit_a32_neg64(dst, ctx);
				1541	break;
				1542	/* dst = dst * src/imm */
				1543	case BPF_ALU64 \| BPF_MUL \| BPF_X:
				1544	case BPF_ALU64 \| BPF_MUL \| BPF_K:
				1545	switch (BPF_SRC(code)) {
				1546	case BPF_X:
				1547	emit_a32_mul_r64(dst, src, ctx);
				1548	break;
				1549	case BPF_K:
				1550	/* Move immediate value to the temporary register
				1551	* and then do the multiplication on it as this
				1552	* will sign-extend the immediate value into temp
				1553	* reg then it would be safe to do the operation
				1554	* on it.
				1555	*/
				1556	emit_a32_mov_se_i64(is64, tmp2, imm, ctx);
				1557	emit_a32_mul_r64(dst, tmp2, ctx);
				1558	break;
				1559	}
				1560	break;
				1561	/* dst = htole(dst) */
				1562	/* dst = htobe(dst) */
				1563	case BPF_ALU \| BPF_END \| BPF_FROM_LE:
				1564	case BPF_ALU \| BPF_END \| BPF_FROM_BE:
				1565	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				1566	if (BPF_SRC(code) == BPF_FROM_LE)
				1567	goto emit_bswap_uxt;
				1568	switch (imm) {
				1569	case 16:
				1570	emit_rev16(rd[1], rd[1], ctx);
				1571	goto emit_bswap_uxt;
				1572	case 32:
				1573	emit_rev32(rd[1], rd[1], ctx);
				1574	goto emit_bswap_uxt;
				1575	case 64:
				1576	emit_rev32(ARM_LR, rd[1], ctx);
				1577	emit_rev32(rd[1], rd[0], ctx);
				1578	emit(ARM_MOV_R(rd[0], ARM_LR), ctx);
				1579	break;
				1580	}
				1581	goto exit;
				1582	emit_bswap_uxt:
				1583	switch (imm) {
				1584	case 16:
				1585	/* zero-extend 16 bits into 64 bits */
				1586	#if __LINUX_ARM_ARCH__ < 6
				1587	emit_a32_mov_i(tmp2[1], 0xffff, ctx);
				1588	emit(ARM_AND_R(rd[1], rd[1], tmp2[1]), ctx);
				1589	#else /* ARMv6+ */
				1590	emit(ARM_UXTH(rd[1], rd[1]), ctx);
				1591	#endif
				1592	if (!ctx->prog->aux->verifier_zext)
				1593	emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
				1594	break;
				1595	case 32:
				1596	/* zero-extend 32 bits into 64 bits */
				1597	if (!ctx->prog->aux->verifier_zext)
				1598	emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
				1599	break;
				1600	case 64:
				1601	/* nop */
				1602	break;
				1603	}
				1604	exit:
				1605	arm_bpf_put_reg64(dst, rd, ctx);
				1606	break;
				1607	/* dst = imm64 */
				1608	case BPF_LD \| BPF_IMM \| BPF_DW:
				1609	{
				1610	u64 val = (u32)imm \| (u64)insn[1].imm << 32;
				1611
				1612	emit_a32_mov_i64(dst, val, ctx);
				1613
				1614	return 1;
				1615	}
				1616	/* LDX: dst = (size )(src + off) */
				1617	case BPF_LDX \| BPF_MEM \| BPF_W:
				1618	case BPF_LDX \| BPF_MEM \| BPF_H:
				1619	case BPF_LDX \| BPF_MEM \| BPF_B:
				1620	case BPF_LDX \| BPF_MEM \| BPF_DW:
				1621	rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				1622	emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
				1623	break;
				1624	/* speculation barrier */
				1625	case BPF_ST \| BPF_NOSPEC:
				1626	break;
				1627	/* ST: (size )(dst + off) = imm */
				1628	case BPF_ST \| BPF_MEM \| BPF_W:
				1629	case BPF_ST \| BPF_MEM \| BPF_H:
				1630	case BPF_ST \| BPF_MEM \| BPF_B:
				1631	case BPF_ST \| BPF_MEM \| BPF_DW:
				1632	switch (BPF_SIZE(code)) {
				1633	case BPF_DW:
				1634	/* Sign-extend immediate value into temp reg */
				1635	emit_a32_mov_se_i64(true, tmp2, imm, ctx);
				1636	break;
				1637	case BPF_W:
				1638	case BPF_H:
				1639	case BPF_B:
				1640	emit_a32_mov_i(tmp2[1], imm, ctx);
				1641	break;
				1642	}
				1643	emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code));
				1644	break;
				1645	/* STX XADD: lock (u32 )(dst + off) += src */
				1646	case BPF_STX \| BPF_XADD \| BPF_W:
				1647	/* STX XADD: lock (u64 )(dst + off) += src */
				1648	case BPF_STX \| BPF_XADD \| BPF_DW:
				1649	goto notyet;
				1650	/* STX: (size )(dst + off) = src */
				1651	case BPF_STX \| BPF_MEM \| BPF_W:
				1652	case BPF_STX \| BPF_MEM \| BPF_H:
				1653	case BPF_STX \| BPF_MEM \| BPF_B:
				1654	case BPF_STX \| BPF_MEM \| BPF_DW:
				1655	rs = arm_bpf_get_reg64(src, tmp2, ctx);
				1656	emit_str_r(dst_lo, rs, off, ctx, BPF_SIZE(code));
				1657	break;
				1658	/* PC += off if dst == src */
				1659	/* PC += off if dst > src */
				1660	/* PC += off if dst >= src */
				1661	/* PC += off if dst < src */
				1662	/* PC += off if dst <= src */
				1663	/* PC += off if dst != src */
				1664	/* PC += off if dst > src (signed) */
				1665	/* PC += off if dst >= src (signed) */
				1666	/* PC += off if dst < src (signed) */
				1667	/* PC += off if dst <= src (signed) */
				1668	/* PC += off if dst & src */
				1669	case BPF_JMP \| BPF_JEQ \| BPF_X:
				1670	case BPF_JMP \| BPF_JGT \| BPF_X:
				1671	case BPF_JMP \| BPF_JGE \| BPF_X:
				1672	case BPF_JMP \| BPF_JNE \| BPF_X:
				1673	case BPF_JMP \| BPF_JSGT \| BPF_X:
				1674	case BPF_JMP \| BPF_JSGE \| BPF_X:
				1675	case BPF_JMP \| BPF_JSET \| BPF_X:
				1676	case BPF_JMP \| BPF_JLE \| BPF_X:
				1677	case BPF_JMP \| BPF_JLT \| BPF_X:
				1678	case BPF_JMP \| BPF_JSLT \| BPF_X:
				1679	case BPF_JMP \| BPF_JSLE \| BPF_X:
				1680	case BPF_JMP32 \| BPF_JEQ \| BPF_X:
				1681	case BPF_JMP32 \| BPF_JGT \| BPF_X:
				1682	case BPF_JMP32 \| BPF_JGE \| BPF_X:
				1683	case BPF_JMP32 \| BPF_JNE \| BPF_X:
				1684	case BPF_JMP32 \| BPF_JSGT \| BPF_X:
				1685	case BPF_JMP32 \| BPF_JSGE \| BPF_X:
				1686	case BPF_JMP32 \| BPF_JSET \| BPF_X:
				1687	case BPF_JMP32 \| BPF_JLE \| BPF_X:
				1688	case BPF_JMP32 \| BPF_JLT \| BPF_X:
				1689	case BPF_JMP32 \| BPF_JSLT \| BPF_X:
				1690	case BPF_JMP32 \| BPF_JSLE \| BPF_X:
				1691	/* Setup source registers */
				1692	rm = arm_bpf_get_reg32(src_hi, tmp2[0], ctx);
				1693	rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
				1694	goto go_jmp;
				1695	/* PC += off if dst == imm */
				1696	/* PC += off if dst > imm */
				1697	/* PC += off if dst >= imm */
				1698	/* PC += off if dst < imm */
				1699	/* PC += off if dst <= imm */
				1700	/* PC += off if dst != imm */
				1701	/* PC += off if dst > imm (signed) */
				1702	/* PC += off if dst >= imm (signed) */
				1703	/* PC += off if dst < imm (signed) */
				1704	/* PC += off if dst <= imm (signed) */
				1705	/* PC += off if dst & imm */
				1706	case BPF_JMP \| BPF_JEQ \| BPF_K:
				1707	case BPF_JMP \| BPF_JGT \| BPF_K:
				1708	case BPF_JMP \| BPF_JGE \| BPF_K:
				1709	case BPF_JMP \| BPF_JNE \| BPF_K:
				1710	case BPF_JMP \| BPF_JSGT \| BPF_K:
				1711	case BPF_JMP \| BPF_JSGE \| BPF_K:
				1712	case BPF_JMP \| BPF_JSET \| BPF_K:
				1713	case BPF_JMP \| BPF_JLT \| BPF_K:
				1714	case BPF_JMP \| BPF_JLE \| BPF_K:
				1715	case BPF_JMP \| BPF_JSLT \| BPF_K:
				1716	case BPF_JMP \| BPF_JSLE \| BPF_K:
				1717	case BPF_JMP32 \| BPF_JEQ \| BPF_K:
				1718	case BPF_JMP32 \| BPF_JGT \| BPF_K:
				1719	case BPF_JMP32 \| BPF_JGE \| BPF_K:
				1720	case BPF_JMP32 \| BPF_JNE \| BPF_K:
				1721	case BPF_JMP32 \| BPF_JSGT \| BPF_K:
				1722	case BPF_JMP32 \| BPF_JSGE \| BPF_K:
				1723	case BPF_JMP32 \| BPF_JSET \| BPF_K:
				1724	case BPF_JMP32 \| BPF_JLT \| BPF_K:
				1725	case BPF_JMP32 \| BPF_JLE \| BPF_K:
				1726	case BPF_JMP32 \| BPF_JSLT \| BPF_K:
				1727	case BPF_JMP32 \| BPF_JSLE \| BPF_K:
				1728	if (off == 0)
				1729	break;
				1730	rm = tmp2[0];
				1731	rn = tmp2[1];
				1732	/* Sign-extend immediate value */
				1733	emit_a32_mov_se_i64(true, tmp2, imm, ctx);
				1734	go_jmp:
				1735	/* Setup destination register */
				1736	rd = arm_bpf_get_reg64(dst, tmp, ctx);
				1737
				1738	/* Check for the condition */
				1739	emit_ar_r(rd[0], rd[1], rm, rn, ctx, BPF_OP(code),
				1740	BPF_CLASS(code) == BPF_JMP);
				1741
				1742	/* Setup JUMP instruction */
				1743	jmp_offset = bpf2a32_offset(i+off, i, ctx);
				1744	switch (BPF_OP(code)) {
				1745	case BPF_JNE:
				1746	case BPF_JSET:
				1747	_emit(ARM_COND_NE, ARM_B(jmp_offset), ctx);
				1748	break;
				1749	case BPF_JEQ:
				1750	_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
				1751	break;
				1752	case BPF_JGT:
				1753	_emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
				1754	break;
				1755	case BPF_JGE:
				1756	_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
				1757	break;
				1758	case BPF_JSGT:
				1759	_emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
				1760	break;
				1761	case BPF_JSGE:
				1762	_emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
				1763	break;
				1764	case BPF_JLE:
				1765	_emit(ARM_COND_LS, ARM_B(jmp_offset), ctx);
				1766	break;
				1767	case BPF_JLT:
				1768	_emit(ARM_COND_CC, ARM_B(jmp_offset), ctx);
				1769	break;
				1770	case BPF_JSLT:
				1771	_emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
				1772	break;
				1773	case BPF_JSLE:
				1774	_emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
				1775	break;
				1776	}
				1777	break;
				1778	/* JMP OFF */
				1779	case BPF_JMP \| BPF_JA:
				1780	{
				1781	if (off == 0)
				1782	break;
				1783	jmp_offset = bpf2a32_offset(i+off, i, ctx);
				1784	check_imm24(jmp_offset);
				1785	emit(ARM_B(jmp_offset), ctx);
				1786	break;
				1787	}
				1788	/* tail call */
				1789	case BPF_JMP \| BPF_TAIL_CALL:
				1790	if (emit_bpf_tail_call(ctx))
				1791	return -EFAULT;
				1792	break;
				1793	/* function call */
				1794	case BPF_JMP \| BPF_CALL:
				1795	{
				1796	const s8 *r0 = bpf2a32[BPF_REG_0];
				1797	const s8 *r1 = bpf2a32[BPF_REG_1];
				1798	const s8 *r2 = bpf2a32[BPF_REG_2];
				1799	const s8 *r3 = bpf2a32[BPF_REG_3];
				1800	const s8 *r4 = bpf2a32[BPF_REG_4];
				1801	const s8 *r5 = bpf2a32[BPF_REG_5];
				1802	const u32 func = (u32)__bpf_call_base + (u32)imm;
				1803
				1804	emit_a32_mov_r64(true, r0, r1, ctx);
				1805	emit_a32_mov_r64(true, r1, r2, ctx);
				1806	emit_push_r64(r5, ctx);
				1807	emit_push_r64(r4, ctx);
				1808	emit_push_r64(r3, ctx);
				1809
				1810	emit_a32_mov_i(tmp[1], func, ctx);
				1811	emit_blx_r(tmp[1], ctx);
				1812
				1813	emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); // callee clean
				1814	break;
				1815	}
				1816	/* function return */
				1817	case BPF_JMP \| BPF_EXIT:
				1818	/* Optimization: when last instruction is EXIT
				1819	* simply fallthrough to epilogue.
				1820	*/
				1821	if (i == ctx->prog->len - 1)
				1822	break;
				1823	jmp_offset = epilogue_offset(ctx);
				1824	check_imm24(jmp_offset);
				1825	emit(ARM_B(jmp_offset), ctx);
				1826	break;
				1827	notyet:
				1828	pr_info_once("* NOT YET: opcode %02x *\n", code);
				1829	return -EFAULT;
				1830	default:
				1831	pr_err_once("unknown opcode %02x\n", code);
				1832	return -EINVAL;
				1833	}
				1834
				1835	if (ctx->flags & FLAG_IMM_OVERFLOW)
				1836	/*
				1837	* this instruction generated an overflow when
				1838	* trying to access the literal pool, so
				1839	* delegate this filter to the kernel interpreter.
				1840	*/
				1841	return -1;
				1842	return 0;
				1843	}
				1844
				1845	static int build_body(struct jit_ctx *ctx)
				1846	{
				1847	const struct bpf_prog *prog = ctx->prog;
				1848	unsigned int i;
				1849
				1850	for (i = 0; i < prog->len; i++) {
				1851	const struct bpf_insn *insn = &(prog->insnsi[i]);
				1852	int ret;
				1853
				1854	ret = build_insn(insn, ctx);
				1855
				1856	/* It's used with loading the 64 bit immediate value. */
				1857	if (ret > 0) {
				1858	i++;
				1859	if (ctx->target == NULL)
				1860	ctx->offsets[i] = ctx->idx;
				1861	continue;
				1862	}
				1863
				1864	if (ctx->target == NULL)
				1865	ctx->offsets[i] = ctx->idx;
				1866
				1867	/* If unsuccesfull, return with error code */
				1868	if (ret)
				1869	return ret;
				1870	}
				1871	return 0;
				1872	}
				1873
				1874	static int validate_code(struct jit_ctx *ctx)
				1875	{
				1876	int i;
				1877
				1878	for (i = 0; i < ctx->idx; i++) {
				1879	if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF))
				1880	return -1;
				1881	}
				1882
				1883	return 0;
				1884	}
				1885
				1886	void bpf_jit_compile(struct bpf_prog *prog)
				1887	{
				1888	/* Nothing to do here. We support Internal BPF. */
				1889	}
				1890
				1891	bool bpf_jit_needs_zext(void)
				1892	{
				1893	return true;
				1894	}
				1895
				1896	struct bpf_prog bpf_int_jit_compile(struct bpf_prog prog)
				1897	{
				1898	struct bpf_prog tmp, orig_prog = prog;
				1899	struct bpf_binary_header *header;
				1900	bool tmp_blinded = false;
				1901	struct jit_ctx ctx;
				1902	unsigned int tmp_idx;
				1903	unsigned int image_size;
				1904	u8 *image_ptr;
				1905
				1906	/* If BPF JIT was not enabled then we must fall back to
				1907	* the interpreter.
				1908	*/
				1909	if (!prog->jit_requested)
				1910	return orig_prog;
				1911
				1912	/* If constant blinding was enabled and we failed during blinding
				1913	* then we must fall back to the interpreter. Otherwise, we save
				1914	* the new JITed code.
				1915	*/
				1916	tmp = bpf_jit_blind_constants(prog);
				1917
				1918	if (IS_ERR(tmp))
				1919	return orig_prog;
				1920	if (tmp != prog) {
				1921	tmp_blinded = true;
				1922	prog = tmp;
				1923	}
				1924
				1925	memset(&ctx, 0, sizeof(ctx));
				1926	ctx.prog = prog;
				1927	ctx.cpu_architecture = cpu_architecture();
				1928
				1929	/* Not able to allocate memory for offsets[] , then
				1930	* we must fall back to the interpreter
				1931	*/
				1932	ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
				1933	if (ctx.offsets == NULL) {
				1934	prog = orig_prog;
				1935	goto out;
				1936	}
				1937
				1938	/* 1) fake pass to find in the length of the JITed code,
				1939	* to compute ctx->offsets and other context variables
				1940	* needed to compute final JITed code.
				1941	* Also, calculate random starting pointer/start of JITed code
				1942	* which is prefixed by random number of fault instructions.
				1943	*
				1944	* If the first pass fails then there is no chance of it
				1945	* being successful in the second pass, so just fall back
				1946	* to the interpreter.
				1947	*/
				1948	if (build_body(&ctx)) {
				1949	prog = orig_prog;
				1950	goto out_off;
				1951	}
				1952
				1953	tmp_idx = ctx.idx;
				1954	build_prologue(&ctx);
				1955	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
				1956
				1957	ctx.epilogue_offset = ctx.idx;
				1958
				1959	#if __LINUX_ARM_ARCH__ < 7
				1960	tmp_idx = ctx.idx;
				1961	build_epilogue(&ctx);
				1962	ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
				1963
				1964	ctx.idx += ctx.imm_count;
				1965	if (ctx.imm_count) {
				1966	ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL);
				1967	if (ctx.imms == NULL) {
				1968	prog = orig_prog;
				1969	goto out_off;
				1970	}
				1971	}
				1972	#else
				1973	/* there's nothing about the epilogue on ARMv7 */
				1974	build_epilogue(&ctx);
				1975	#endif
				1976	/* Now we can get the actual image size of the JITed arm code.
				1977	* Currently, we are not considering the THUMB-2 instructions
				1978	* for jit, although it can decrease the size of the image.
				1979	*
				1980	* As each arm instruction is of length 32bit, we are translating
				1981	* number of JITed intructions into the size required to store these
				1982	* JITed code.
				1983	*/
				1984	image_size = sizeof(u32) * ctx.idx;
				1985
				1986	/* Now we know the size of the structure to make */
				1987	header = bpf_jit_binary_alloc(image_size, &image_ptr,
				1988	sizeof(u32), jit_fill_hole);
				1989	/* Not able to allocate memory for the structure then
				1990	* we must fall back to the interpretation
				1991	*/
				1992	if (header == NULL) {
				1993	prog = orig_prog;
				1994	goto out_imms;
				1995	}
				1996
				1997	/* 2.) Actual pass to generate final JIT code */
				1998	ctx.target = (u32 *) image_ptr;
				1999	ctx.idx = 0;
				2000
				2001	build_prologue(&ctx);
				2002
				2003	/* If building the body of the JITed code fails somehow,
				2004	* we fall back to the interpretation.
				2005	*/
				2006	if (build_body(&ctx) < 0) {
				2007	image_ptr = NULL;
				2008	bpf_jit_binary_free(header);
				2009	prog = orig_prog;
				2010	goto out_imms;
				2011	}
				2012	build_epilogue(&ctx);
				2013
				2014	/* 3.) Extra pass to validate JITed Code */
				2015	if (validate_code(&ctx)) {
				2016	image_ptr = NULL;
				2017	bpf_jit_binary_free(header);
				2018	prog = orig_prog;
				2019	goto out_imms;
				2020	}
				2021	flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));
				2022
				2023	if (bpf_jit_enable > 1)
				2024	/* there are 2 passes here */
				2025	bpf_jit_dump(prog->len, image_size, 2, ctx.target);
				2026
				2027	bpf_jit_binary_lock_ro(header);
				2028	prog->bpf_func = (void *)ctx.target;
				2029	prog->jited = 1;
				2030	prog->jited_len = image_size;
				2031
				2032	out_imms:
				2033	#if __LINUX_ARM_ARCH__ < 7
				2034	if (ctx.imm_count)
				2035	kfree(ctx.imms);
				2036	#endif
				2037	out_off:
				2038	kfree(ctx.offsets);
				2039	out:
				2040	if (tmp_blinded)
				2041	bpf_jit_prog_release_other(prog, prog == orig_prog ?
				2042	tmp : orig_prog);
				2043	return prog;
				2044	}
				2045