/* strchrnul - find a character or nul in a string

   Copyright (C) 2014-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */
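
/* C-level contract (GNU extension, declared in <string.h>):

     char *strchrnul (const char *s, int c);

   returns a pointer to the first occurrence of C (converted to a char)
   in the string S, or, if C does not occur, a pointer to the
   terminating nul byte rather than NULL.  */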

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */
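
/* For example, if the first terminating byte in a hunk (the requested
   character or the nul) is at byte offset 5, the lowest set bit of the
   syndrome is bit 10; count_trailing_zeros() therefore yields 10, and
   halving it recovers the byte offset 5.  */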

ENTRY (__strchrnul)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
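	/* Replicating 0x40100401 gives vrepmask the repeating byte
	   pattern 0x01, 0x04, 0x10, 0x40, so after the ANDs below each
	   matching byte contributes one distinct bit within its group of
	   four, and the ADDP reductions can sum neighbouring bytes
	   without carries to pack all 32 flags into the 64-bit
	   syndrome.  */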
	ands	tmp1, srcin, #31
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
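	/* Build the padding mask: with tmp1 = -(srcin & 31), doubling it
	   and shifting an all-ones register right by (64 - 2*(srcin & 31))
	   leaves ones in exactly the syndrome bits that belong to the
	   padding bytes ahead of the real string start; the BIC below
	   clears those bits.  */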
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.2d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
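	/* The ADDP folds the two 64-bit halves of the OR-ed match bytes
	   into lane 0; since every byte is either 0x00 or 0xff, the sum
	   is non-zero exactly when some byte in the hunk matched, so a
	   single scalar test covers all 32 bytes.  */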
	mov	tmp1, vend1.2d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now we need to establish exactly
	   where in the hunk it occurred.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
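	/* The in-loop check only detected that a match exists somewhere;
	   the ANDs with vrepmask and the two ADDP reductions above
	   rebuild the positional 64-bit syndrome described in the
	   core-algorithm comment so the tail code can locate the first
	   match.  */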

	mov	tmp1, vend1.2d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END(__strchrnul)
weak_alias (__strchrnul, strchrnul)