/* strchrnul - find a character or nul in a string

   Copyright (C) 2014-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */
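
/* C-level contract (GNU extension, declared in <string.h>):

     char *strchrnul (const char *s, int c);

   returns a pointer to the first occurrence of C (converted to a char)
   in the string S, or, if C does not occur, a pointer to the
   terminating nul byte rather than NULL.  */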

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */
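
/* For example, if the first terminating byte in a hunk (the requested
   character or the nul) is at byte offset 5, the lowest set bit of the
   syndrome is bit 10; count_trailing_zeros() therefore yields 10, and
   halving it recovers the byte offset 5.  */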

ENTRY (__strchrnul)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
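	/* Replicating 0x40100401 gives vrepmask the repeating byte
	   pattern 0x01, 0x04, 0x10, 0x40, so after the ANDs below each
	   matching byte contributes one distinct bit within its group of
	   four, and the ADDP reductions can sum neighbouring bytes
	   without carries to pack all 32 flags into the 64-bit
	   syndrome.  */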
	ands	tmp1, srcin, #31
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
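	/* Build the padding mask: with tmp1 = -(srcin & 31), doubling it
	   and shifting an all-ones register right by (64 - 2*(srcin & 31))
	   leaves ones in exactly the syndrome bits that belong to the
	   padding bytes ahead of the real string start; the BIC below
	   clears those bits.  */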
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.2d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
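	/* The ADDP folds the two 64-bit halves of the OR-ed match bytes
	   into lane 0; since every byte is either 0x00 or 0xff, the sum
	   is non-zero exactly when some byte in the hunk matched, so a
	   single scalar test covers all 32 bytes.  */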
	mov	tmp1, vend1.2d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now we need to establish exactly
	   where in the hunk it occurred.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
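	/* The in-loop check only detected that a match exists somewhere;
	   the ANDs with vrepmask and the two ADDP reductions above
	   rebuild the positional 64-bit syndrome described in the
	   core-algorithm comment so the tail code can locate the first
	   match.  */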

	mov	tmp1, vend1.2d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END(__strchrnul)
weak_alias (__strchrnul, strchrnul)