/* strcpy/stpcpy - copy a string returning pointer to start/end.
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
   the slower entry path.  This option is not intended for production use.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* Arguments and results.  */
#define dstin		x0
#define srcin		x1

/* Locals and temporaries.  */
#define src		x2
#define dst		x3
#define data1		x4
#define data1w		w4
#define data2		x5
#define data2w		w5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define data1a		x13
#define data2a		x14
#define pos		x15
#define len		x16
#define to_align	x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy
#else
#define STRCPY strcpy
#endif

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
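
	/* A worked per-byte example (illustrative values): for X == 0x00,
	   (X - 1) & ~X & 0x80 == 0xff & 0xff & 0x80 == 0x80, so the byte
	   is flagged; for X == 0x01 the subtraction gives 0x00, and for
	   X == 0x80 the ~X term clears bit 7, so non-zero bytes stay
	   clear.  The REP8 constants above apply this test to all eight
	   bytes of a 64-bit word at once.  */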

	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code.  We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path.  Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
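
	/* Illustrative arithmetic for the entry test below: with
	   MIN_PAGE_P2 == 12, MIN_PAGE_SIZE - 16 == 4080.  A 16-byte fetch
	   starting at page offset 4080 ends at offset 4095 and stays
	   within the page, while one starting at offset 4081 reaches
	   offset 4096, so (srcin & 4095) > 4080 is exactly the
	   page-crossing case.  */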

ENTRY_ALIGN (STRCPY, 6)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect strcpy to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate this by preferring
	   conditional select operations over branches whenever this is
	   feasible.  */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
	b.gt	L(page_cross)

L(page_cross_ok):
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	L(bulk_entry)

	/* The string is short (<=16 bytes).  We don't yet know exactly how
	   short, though.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
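	/* An illustrative trace (assumed values): a string whose NUL is at
	   byte 11 has the NUL in byte 3 of data2, so pos/8 == 3 below.
	   data2 is shifted so that its eight-byte store at [dstin + 4]
	   ends exactly at the NUL; the following store of data1 then
	   repairs the low bytes that the shift zero-filled.  */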
L(fp_gt8):
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8
#endif
	ret
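	/* The NUL lies in the first DWord here: dst below points at the
	   NUL itself, and for the 4..8 byte case the two word stores
	   mirror the overlapping trick above, with the store ending
	   exactly at the NUL issued first and then repaired by the store
	   from dstin.  */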
L(fp_le8):
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	subs	tmp2, pos, #24			/* Pos in bits. */
	b.lt	L(fp_lt4)
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* 4->7 bytes to copy.  */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
L(fp_lt4):
	cbz	pos, L(fp_lt2)
	/* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go.  */
L(fp_lt2):
	/* Null-terminated string.  Last character must be zero!  */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret

	.p2align 6
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
L(bulk_entry):
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	L(entry_no_page_cross)
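	/* After the adjustment above (an illustrative note): src is the
	   first 16-byte aligned source address strictly past srcin, with
	   dst at the matching destination offset.  The first loop
	   iteration may recopy up to 15 bytes already written by the stp
	   above, which is harmless since source and destination offsets
	   match.  */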

	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
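	/* A note on the loop-exit test below: bics sets the flags from
	   has_nul2; if that is zero, the ccmp re-tests has_nul1 against
	   zero, otherwise it forces NZCV to 0000 (ne).  The single b.eq
	   therefore stays in the loop only while both words are
	   NUL-free.  */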
L(main_loop):
	stp	data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(main_loop)

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
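	/* Offset arithmetic (illustrative): pos + 8 bits when the NUL is
	   in data1, pos + 72 when it is in data2; either way pos >> 3
	   becomes the distance from the start of the just-loaded pair to
	   one past the NUL.  src and dst already point 16 bytes beyond
	   that pair, hence the -32 and -16 offsets on the final
	   ldp/stp.  */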
	cmp	has_nul1, #0
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
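	/* Illustrative failure case: data ending "...\x01\x00" - the
	   borrow out of the 0x00 byte sets a spurious syndrome bit in the
	   0x01 byte, which in big-endian byte order lies before the real
	   NUL, hence the recomputation below.  */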
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret

L(page_cross):
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
	tst	to_align, #7
	csetm	tmp2, ne
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
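	/* Worked example (illustrative): with to_align == 13, thirteen
	   bytes precede srcin.  tmp1 & 63 == 24, so the all-ones tmp2 is
	   shifted down to a 40-bit mask; the csinv then forces all of
	   data1 to 0xff and the csel picks the masked data2a, covering
	   the five leading bytes of data2.  */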
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
	neg	tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, L(fp_le8)
	bic	has_nul2, tmp3, tmp4
	b	L(fp_gt8)
END (STRCPY)

#ifdef BUILD_STPCPY
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)
#else
libc_hidden_builtin_def (strcpy)
#endif