Blame - ap/libc/glibc/glibc-2.22/sysdeps/aarch64/strncmp.S - T106_DC

blob: 483b6fdbca9a703dd5cbd087237b726b2f273fce [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
				2
				3	This file is part of the GNU C Library.
				4
				5	The GNU C Library is free software; you can redistribute it and/or
				6	modify it under the terms of the GNU Lesser General Public
				7	License as published by the Free Software Foundation; either
				8	version 2.1 of the License, or (at your option) any later version.
				9
				10	The GNU C Library is distributed in the hope that it will be useful,
				11	but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	Lesser General Public License for more details.
				14
				15	You should have received a copy of the GNU Lesser General Public
				16	License along with the GNU C Library. If not, see
				17	<http://www.gnu.org/licenses/>. */
				18
				19	#include <sysdep.h>
				20
				21	/* Assumptions:
				22	*
				23	* ARMv8-a, AArch64
				24	*/
				25
				26	#define REP8_01 0x0101010101010101 /* 0x01 in every byte; loaded into zeroones. */
				27	#define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in every byte; ORed with the data in the NUL test. */
				28	#define REP8_80 0x8080808080808080 /* 0x80 in every byte; named in the NUL-detection comment, not used by the instructions below. */
				29
				30	/* Parameters and result. */
				31	#define src1 x0 /* First string pointer; post-incremented as data is loaded. */
				32	#define src2 x1 /* Second string pointer; post-incremented as data is loaded. */
				33	#define limit x2 /* Maximum number of bytes to compare. */
				34	#define result x0 /* Return value; shares x0 with src1. */
				35
				36	/* Internal variables. */
				37	#define data1 x3 /* Data loaded from src1. */
				38	#define data1w w3 /* 32-bit view of data1, used by the byte loads. */
				39	#define data2 x4 /* Data loaded from src2. */
				40	#define data2w w4 /* 32-bit view of data2, used by the byte loads. */
				41	#define has_nul x5 /* Non-zero if data1 holds a NUL byte. */
				42	#define diff x6 /* data1 ^ data2. */
				43	#define syndrome x7 /* diff | has_nul. */
				44	#define tmp1 x8 /* Scratch. */
				45	#define tmp2 x9 /* Scratch. */
				46	#define tmp3 x10 /* Scratch. */
				47	#define zeroones x11 /* Holds REP8_01. */
				48	#define pos x12 /* Leading-zero count of syndrome. */
				49	#define limit_wd x13 /* Dword counter derived from limit. */
				50	#define mask x14 /* Ones in the bytes beyond the limit in the last dword. */
				51	#define endloop x15 /* Non-zero when the aligned loop must stop (diff or limit hit). */
				52
				53	ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) /* In: x0 = src1, x1 = src2, x2 = limit. Out: x0 = 0 if limit is 0 or no difference is found, else a value whose sign orders the first differing byte. */
				54	cbz limit, L(ret0) /* limit == 0: nothing to compare. */
				55	eor tmp1, src1, src2
				56	mov zeroones, #REP8_01
				57	tst tmp1, #7 /* Do the pointers differ in their low 3 bits? */
				58	b.ne L(misaligned8) /* Yes: use the byte-at-a-time loop. */
				59	ands tmp1, src1, #7 /* tmp1 = offset of src1 within its dword. */
				60	b.ne L(mutual_align) /* Same offset in both, but not dword aligned. */
				61	/* Calculate the number of full and partial words -1. */
				62	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
				63	lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
				64
				65	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
				66	(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
				67	can be done in parallel across the entire word. */
				68	/* Start of performance-critical section -- one 64B cache line. */
				69	L(loop_aligned):
				70	ldr data1, [src1], #8 /* Load a dword and advance src1. */
				71	ldr data2, [src2], #8 /* Load a dword and advance src2. */
				72	L(start_realigned):
				73	subs limit_wd, limit_wd, #1 /* N is set once the count drops below zero. */
				74	sub tmp1, data1, zeroones /* tmp1 = X - 0x01 in every byte. */
				75	orr tmp2, data1, #REP8_7f /* tmp2 = X | 0x7f in every byte. */
				76	eor diff, data1, data2 /* Non-zero if differences found. */
				77	csinv endloop, diff, xzr, pl /* Last Dword or differences. */
				78	bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
				79	ccmp endloop, #0, #0, eq /* No NUL: test endloop == 0; otherwise force NE. */
				80	b.eq L(loop_aligned) /* Continue while no NUL, no diff and limit not hit. */
				81	/* End of performance-critical section -- one 64B cache line. */
				82
				83	/* Not reached the limit, must have found the end or a diff. */
				84	tbz limit_wd, #63, L(not_limit) /* Bit 63 clear: count is still non-negative. */
				85
				86	/* Limit % 8 == 0 => all bytes significant. */
				87	ands limit, limit, #7
				88	b.eq L(not_limit)
				89
				90	lsl limit, limit, #3 /* Bytes -> bits. */
				91	mov mask, #~0
				92	#ifdef __AARCH64EB__
				93	lsr mask, mask, limit /* Leave ones only in the bytes past the limit. */
				94	#else
				95	lsl mask, mask, limit /* Leave ones only in the bytes past the limit. */
				96	#endif
				97	bic data1, data1, mask /* Clear the bytes beyond the limit. */
				98	bic data2, data2, mask /* Clear the bytes beyond the limit. */
				99
				100	/* Make sure that the NUL byte is marked in the syndrome. */
				101	orr has_nul, has_nul, mask
				102
				103	L(not_limit):
				104	orr syndrome, diff, has_nul
				105
				106	#ifndef __AARCH64EB__
				107	rev syndrome, syndrome /* Byte-reverse so the first byte in memory is most significant. */
				108	rev data1, data1
				109	/* The MS-non-zero bit of the syndrome marks either the first bit
				110	that is different, or the top bit of the first zero byte.
				111	Shifting left now will bring the critical information into the
				112	top bits. */
				113	clz pos, syndrome
				114	rev data2, data2
				115	lsl data1, data1, pos
				116	lsl data2, data2, pos
				117	/* But we need to zero-extend (char is unsigned) the value and then
				118	perform a signed 32-bit subtraction. */
				119	lsr data1, data1, #56
				120	sub result, data1, data2, lsr #56
				121	RET
				122	#else
				123	/* For big-endian we cannot use the trick with the syndrome value
				124	as carry-propagation can corrupt the upper bits if the trailing
				125	bytes in the string contain 0x01. */
				126	/* However, if there is no NUL byte in the dword, we can generate
				127	the result directly. We can't just subtract the bytes as the
				128	MSB might be significant. */
				129	cbnz has_nul, 1f
				130	cmp data1, data2
				131	cset result, ne /* result = 1 if the dwords differ, else 0. */
				132	cneg result, result, lo /* Negate when data1 < data2 (unsigned). */
				133	RET
				134	1:
				135	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
				136	rev tmp3, data1
				137	sub tmp1, tmp3, zeroones
				138	orr tmp2, tmp3, #REP8_7f
				139	bic has_nul, tmp1, tmp2
				140	rev has_nul, has_nul
				141	orr syndrome, diff, has_nul
				142	clz pos, syndrome
				143	/* The MS-non-zero bit of the syndrome marks either the first bit
				144	that is different, or the top bit of the first zero byte.
				145	Shifting left now will bring the critical information into the
				146	top bits. */
				147	lsl data1, data1, pos
				148	lsl data2, data2, pos
				149	/* But we need to zero-extend (char is unsigned) the value and then
				150	perform a signed 32-bit subtraction. */
				151	lsr data1, data1, #56
				152	sub result, data1, data2, lsr #56
				153	RET
				154	#endif
				155
				156	L(mutual_align):
				157	/* Sources are mutually aligned, but are not currently at an
				158	alignment boundary. Round down the addresses and then mask off
				159	the bytes that precede the start point.
				160	We also need to adjust the limit calculations, but without
				161	overflowing if the limit is near ULONG_MAX. */
				162	bic src1, src1, #7
				163	bic src2, src2, #7
				164	ldr data1, [src1], #8
				165	neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
				166	ldr data2, [src2], #8
				167	mov tmp2, #~0
				168	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
				169	#ifdef __AARCH64EB__
				170	/* Big-endian. Early bytes are at MSB. */
				171	lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
				172	#else
				173	/* Little-endian. Early bytes are at LSB. */
				174	lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
				175	#endif
				176	and tmp3, limit_wd, #7 /* Byte remainder of (limit - 1). */
				177	lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
				178	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
				179	add limit, limit, tmp1
				180	add tmp3, tmp3, tmp1 /* Remainder plus the alignment offset. */
				181	orr data1, data1, tmp2 /* Force the bytes before the start to 0xff ... */
				182	orr data2, data2, tmp2 /* ... in both words, so they match and are not NUL. */
				183	add limit_wd, limit_wd, tmp3, lsr #3 /* One more Dword if the offset carried over. */
				184	b L(start_realigned)
				185
				186	L(ret0):
				187	mov result, #0
				188	RET
				189
				190	.p2align 6
				191	L(misaligned8):
				192	sub limit, limit, #1 /* Pre-decrement so the subs below borrows on the last byte. */
				193	1:
				194	/* Perhaps we can do better than this. */
				195	ldrb data1w, [src1], #1
				196	ldrb data2w, [src2], #1
				197	subs limit, limit, #1 /* Carry stays set while bytes remain within the limit. */
				198	ccmp data1w, #1, #0, cs /* Limit left: C set iff data1 != 0; else NZCV = 0b0000. */
				199	ccmp data1w, data2w, #0, cs /* Still going: compare the bytes; else NZCV = 0b0000. */
				200	b.eq 1b /* Loop while bytes are equal, non-NUL and within the limit. */
				201	sub result, data1, data2 /* Difference of the last bytes loaded (zero-extended). */
				202	RET
				203	END (strncmp)
				204	libc_hidden_builtin_def (strncmp)