/* Copyright (C) 2012-2015 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <sysdep.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
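
/* The DC ZVA block size is advertised by DCZID_EL0: bit 4 (DZP) set means
   the instruction is prohibited, and bits [3:0] (BS) give log2 of the
   block size in words, i.e. block bytes = 4 << BS.  The decode below
   follows that layout.  */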

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

ENTRY_ALIGN (__memset, 6)

	mov	dst, dstin		/* Preserve return value.  */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	L(zero_mem)
#endif
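	/* Replicate the fill byte across all 64 bits of A_l, e.g. 0xab
	   becomes 0xabababababababab; writes to A_lw zero-extend into A_l.  */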
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
L(tail_maybe_long):
	cmp	count, #64
	b.ge	L(not_short)
L(tail_maybe_tiny):
	cmp	count, #15
	b.le	L(tail15tiny)
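	/* Set 16..63 bytes.  Mask off the 0x30 bits of count, advance dst by
	   that amount and enter the store ladder at the matching point so the
	   right number of 16-byte stores execute; L(tail15) then finishes
	   with a final, possibly overlapping, 16-byte store.  */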
L(tail63):
	ands	tmp1, count, #0x30
	b.eq	L(tail15)
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

L(tail15):
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store.  */
	RET

L(tail15tiny):
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
L(not_short):
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63)
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
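	/* Main loop: 64 bytes per iteration.  The last stp uses pre-index
	   writeback so dst advances by 64 each pass, and count was
	   pre-decremented by 64 so b.ge keeps looping while at least 64
	   bytes remain.  */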
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	L(tail63)
	RET

#ifndef DONT_USE_DC
	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
L(zero_mem):
	mov	A_l, #0
	cmp	count, #63
	b.le	L(tail_maybe_tiny)
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	L(tail63)
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	L(not_short)
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
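	/* Cached value: 0 means not yet probed, negative (bit 31 set) means
	   DC ZVA is unusable, any other value is the ZVA line length in
	   bytes.  */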
	adrp	tmp2, L(cache_clear)
	ldr	zva_len, [tmp2, #:lo12:L(cache_clear)]
	tbnz	zva_len, #31, L(not_short)
	cbnz	zva_len, L(zero_by_line)
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:L(cache_clear)]
	b	L(not_short)
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:L(cache_clear)]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, L(not_short)
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

L(zero_by_line):
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	L(not_short)		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to copy after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000: if tmp1 >= 64,
					   compare tmp1 with the ZVA length,
					   otherwise force the 'lt' below.  */
	b.lt	L(not_short)
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
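	/* The loop below stores 64 bytes at a time until the tmp2 bytes
	   needed to reach ZVA alignment are covered; it may overshoot, so
	   dst is afterwards pulled back by the (negative) remainder left in
	   tmp2.  */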
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
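	/* count is now negative; its low bits give the 0..zva_len-1 bytes
	   still to be set, which the tail code handles.  */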
	ands	count, count, zva_bits_x
	b.ne	L(tail_maybe_long)
	RET
#ifdef MAYBE_VIRT
	.bss
	.p2align 2
L(cache_clear):
	.space 4
#endif
#endif /* DONT_USE_DC */

END (__memset)
weak_alias (__memset, memset)
libc_hidden_builtin_def (memset)