Blame - marvell/linux/arch/arm64/lib/memset.S - T108

blob: a9c1c9a01ea906954953c6dce74d4c3e482328da [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/* SPDX-License-Identifier: GPL-2.0-only */
				2	/*
				3	* Copyright (C) 2013 ARM Ltd.
				4	* Copyright (C) 2013 Linaro.
				5	*
				6	* This code is based on the glibc cortex-strings work originally authored by Linaro,
				7	* which can be found at:
				8	*
				9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
				10	* files/head:/src/aarch64/
				11	*/
				12
				13	#include <linux/linkage.h>
				14	#include <asm/assembler.h>
				15	#include <asm/cache.h>
				16
				17	/*
				18	* Fill in the buffer with character c (alignment handled by the hardware)
				19	*
				20	* Parameters:
				21	* x0 - buf
				22	* x1 - c
				23	* x2 - n
				24	* Returns:
				25	* x0 - buf
				26	*/
				27
				28	dstin .req x0 /* buf as passed in; never written, so it is also the return value */
				29	val .req w1 /* fill value c; only its low byte is used */
				30	count .req x2 /* number of bytes still to be written */
				31	dst .req x8 /* running store pointer, initialised from dstin */
				32	A_l .req x7 /* fill byte replicated across 64 bits */
				33	A_lw .req w7 /* 32-bit view of A_l */
				34
				35	tmp1 .req x3 /* scratch */
				36	tmp1w .req w3
				37	tmp2 .req x4 /* scratch */
				38	tmp2w .req w4
				39	tmp3 .req x9 /* scratch */
				40	tmp3w .req w9
				41	zva_len_x .req x5 /* DC ZVA block size in bytes */
				42	zva_len .req w5
				43	zva_bits_x .req x6 /* zva_len_x - 1, mask of the offset within a ZVA block */
				44
				45	SYM_FUNC_START_ALIAS(__memset)
				46	SYM_FUNC_START_WEAK_PI(memset)
				47	mov dst, dstin /* Work on a copy so x0 still holds buf on return. */
				48	and A_lw, val, #255 /* Keep only the low byte of c. */
				49	orr A_lw, A_lw, A_lw, lsl #8 /* Replicate the byte to 16 bits, */
				50	orr A_lw, A_lw, A_lw, lsl #16 /* then to 32 bits, */
				51	orr A_l, A_l, A_l, lsl #32 /* then to all 64 bits. */
				52
				53	cmp count, #15
				54	b.hi .Lover16_proc /* 16 bytes or more: take the general path. */
				55	/* count < 16: one store per set bit of count; all stores may be unaligned. */
				56	tbz count, #3, 1f
				57	str A_l, [dst], #8
				58	1:
				59	tbz count, #2, 2f
				60	str A_lw, [dst], #4
				61	2:
				62	tbz count, #1, 3f
				63	strh A_lw, [dst], #2
				64	3:
				65	tbz count, #0, 4f
				66	strb A_lw, [dst]
				67	4:
				68	ret
				69
				70	.Lover16_proc:
				71	/* Check whether the start address is 16-byte aligned. */
				72	neg tmp2, dst
				73	ands tmp2, tmp2, #15 /* tmp2 = bytes needed to reach 16-byte alignment. */
				74	b.eq .Laligned
				75	/*
				76	* count is at least 16, so one stp can store the first 16 bytes; dst is
				77	* then advanced to the next 16-byte boundary. Bytes between that boundary
				78	* and the end of this store are simply written again by the stores below.
				79	*/
				80	stp A_l, A_l, [dst] /* Unaligned store. */
				81	/* Advance dst to the 16-byte boundary. */
				82	sub count, count, tmp2
				83	add dst, dst, tmp2
				84
				85	.Laligned:
				86	cbz A_l, .Lzero_mem /* Zero fill: see whether DC ZVA can be used. */
				87
				88	.Ltail_maybe_long:
				89	cmp count, #64
				90	b.ge .Lnot_short
				91	.Ltail63:
				92	ands tmp1, count, #0x30 /* tmp1 = bytes in whole 16-byte chunks (0/16/32/48). */
				93	b.eq 3f
				94	cmp tmp1w, #0x20
				95	b.eq 1f /* 32 bytes: two stores. */
				96	b.lt 2f /* 16 bytes: one store. */
				97	stp A_l, A_l, [dst], #16 /* 48 bytes: fall through all three stores. */
				98	1:
				99	stp A_l, A_l, [dst], #16
				100	2:
				101	stp A_l, A_l, [dst], #16
				102	/*
				103	* Fewer than 16 bytes remain: use one stp to write the last 16 bytes.
				104	* Some bytes are written twice and the access may be unaligned.
				105	*/
				106	3:
				107	ands count, count, #15
				108	cbz count, 4f
				109	add dst, dst, count
				110	stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
				111	4:
				112	ret
				113
				114	/*
				115	* Critical loop. Start at a new cache line boundary. Assuming
				116	* 64 bytes per line, this ensures the entire loop is in one line.
				117	*/
				118	.p2align L1_CACHE_SHIFT
				119	.Lnot_short:
				120	sub dst, dst, #16 /* Pre-bias: the loop addresses from dst + 16. */
				121	sub count, count, #64
				122	1:
				123	stp A_l, A_l, [dst, #16]
				124	stp A_l, A_l, [dst, #32]
				125	stp A_l, A_l, [dst, #48]
				126	stp A_l, A_l, [dst, #64]! /* Write-back advances dst by 64. */
				127	subs count, count, #64
				128	b.ge 1b
				129	tst count, #0x3f /* Any bytes (1..63) left over? */
				130	add dst, dst, #16 /* Undo the pre-bias; add leaves the flags from tst intact. */
				131	b.ne .Ltail63
				132	.Lexitfunc:
				133	ret
				134
				135	/*
				136	* For zeroing memory, check to see if we can use the ZVA feature to
				137	* zero entire 'cache' lines.
				138	*/
				139	.Lzero_mem:
				140	cmp count, #63
				141	b.le .Ltail63
				142	/*
				143	* For zeroing small amounts of memory, it's not worth setting up
				144	* the line-clear code.
				145	*/
				146	cmp count, #128
				147	b.lt .Lnot_short /* Fall through only when count is at least 128 bytes. */
				148
				149	mrs tmp1, dczid_el0
				150	tbnz tmp1, #4, .Lnot_short /* DCZID_EL0.DZP set: DC ZVA must not be used. */
				151	mov tmp3w, #4
				152	and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
				153	lsl zva_len, tmp3w, zva_len /* zva_len = 4 << BS, the ZVA block size in bytes. */
				154
				155	ands tmp3w, zva_len, #63
				156	/*
				157	* Ensure that zva_len is not less than 64.
				158	* It is not meaningful to use ZVA if the block size is less than 64.
				159	*/
				160	b.ne .Lnot_short
				161	.Lzero_by_line:
				162	/*
				163	* Compute how far we need to go to become suitably aligned. We're
				164	* already at quad-word alignment.
				165	*/
				166	cmp count, zva_len_x
				167	b.lt .Lnot_short /* Not enough to reach alignment. */
				168	sub zva_bits_x, zva_len_x, #1 /* Mask of the offset within a ZVA block. */
				169	neg tmp2, dst
				170	ands tmp2, tmp2, zva_bits_x /* tmp2 = bytes up to the next ZVA block boundary. */
				171	b.eq 2f /* Already aligned. */
				172	/* Not aligned: check that enough remains to zero after alignment. */
				173	sub tmp1, count, tmp2
				174	/*
				175	* Guarantee that the length left for ZVA is at least 64 and at least zva_len,
				176	* so that the stores below and the loop at 2f stay inside the buffer. */
				177	cmp tmp1, #64
				178	ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
				179	b.lt .Lnot_short
				180	/*
				181	* We know that there's at least 64 bytes to zero and that it's safe
				182	* to overrun by 64 bytes.
				183	*/
				184	mov count, tmp1
				185	1:
				186	stp A_l, A_l, [dst]
				187	stp A_l, A_l, [dst, #16]
				188	stp A_l, A_l, [dst, #32]
				189	subs tmp2, tmp2, #64
				190	stp A_l, A_l, [dst, #48]
				191	add dst, dst, #64
				192	b.ge 1b
				193	/* We've overrun a bit, so adjust dst downwards. */
				194	add dst, dst, tmp2
				195	2:
				196	sub count, count, zva_len_x
				197	3:
				198	dc zva, dst /* Zero one whole ZVA block. */
				199	add dst, dst, zva_len_x
				200	subs count, count, zva_len_x
				201	b.ge 3b
				202	ands count, count, zva_bits_x /* Bytes left after the last whole block. */
				203	b.ne .Ltail_maybe_long
				204	ret
				205	SYM_FUNC_END_PI(memset)
				206	EXPORT_SYMBOL(memset)
				207	SYM_FUNC_END_ALIAS(__memset)
				208	EXPORT_SYMBOL(__memset)