| xf.li | bdd93d5 | 2023-05-12 07:10:14 -0700 | [diff] [blame] | 1 | /* Copyright (C) 2012-2016 Free Software Foundation, Inc. | 
|  | 2 |  | 
|  | 3 | This file is part of the GNU C Library. | 
|  | 4 |  | 
|  | 5 | The GNU C Library is free software; you can redistribute it and/or | 
|  | 6 | modify it under the terms of the GNU Lesser General Public | 
|  | 7 | License as published by the Free Software Foundation; either | 
|  | 8 | version 2.1 of the License, or (at your option) any later version. | 
|  | 9 |  | 
|  | 10 | The GNU C Library is distributed in the hope that it will be useful, | 
|  | 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
|  | 13 | Lesser General Public License for more details. | 
|  | 14 |  | 
|  | 15 | You should have received a copy of the GNU Lesser General Public | 
|  | 16 | License along with the GNU C Library.  If not, see | 
|  | 17 | <http://www.gnu.org/licenses/>.  */ | 
|  | 18 |  | 
|  | 19 | #include <sysdep.h> | 
|  | 20 |  | 
|  | 21 | /* Assumptions: | 
|  | 22 | * | 
|  | 23 | * ARMv8-a, AArch64, unaligned accesses, min page size 4k. | 
|  | 24 | */ | 
|  | 25 |  | 
|  | 26 | /* To test the page crossing code path more thoroughly, compile with | 
|  | 27 | -DTEST_PAGE_CROSS - this will force all calls through the slower | 
|  | 28 | entry path.  This option is not intended for production use.  */ | 
|  | 29 |  | 
|  | 30 | /* Arguments and results.  srcin and len are the same register (x0), so writing len destroys the string pointer.  */ | 
|  | 31 | #define srcin		x0	/* In: address of the string.  */ | 
|  | 32 | #define len		x0	/* Out: computed length.  */ | 
|  | 33 |  | 
|  | 34 | /* Locals and temporaries.  has_nul1/has_nul2 share x4/x5 with tmp1/tmp2, so only one name of each pair is live at a time.  */ | 
|  | 35 | #define src		x1	/* Address used for the 16-byte block loads.  */ | 
|  | 36 | #define data1		x2	/* First 8 bytes of the current block.  */ | 
|  | 37 | #define data2		x3	/* Second 8 bytes of the current block.  */ | 
|  | 38 | #define has_nul1	x4	/* NUL syndrome for data1 (aliases tmp1).  */ | 
|  | 39 | #define has_nul2	x5	/* NUL syndrome for data2 (aliases tmp2).  */ | 
|  | 40 | #define tmp1		x4 | 
|  | 41 | #define tmp2		x5 | 
|  | 42 | #define tmp3		x6 | 
|  | 43 | #define tmp4		x7 | 
|  | 44 | #define zeroones	x8	/* Loaded with REP8_01 at function entry.  */ | 
|  | 45 |  | 
|  | 46 | /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 | 
|  | 47 | (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | 
|  | 48 | can be done in parallel across the entire word. A faster check | 
|  | 49 | (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives | 
|  | 50 | false hits for characters 129..255.	*/ | 
|  | 51 |  | 
|  | 52 | #define REP8_01 0x0101010101010101	/* 0x01 in each of the 8 bytes: the "1" subtracted from every byte.  */ | 
|  | 53 | #define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f in each byte: ORed into X to form (X | 0x7f).  */ | 
|  | 54 | #define REP8_80 0x8080808080808080	/* 0x80 in each byte: the per-byte flag bit.  */ | 
|  | 55 |  | 
|  | 56 | #ifdef TEST_PAGE_CROSS | 
|  | 57 | # define MIN_PAGE_SIZE 15	/* The entry test becomes (srcin & 14) > -1 (signed), which is always true.  */ | 
|  | 58 | #else | 
|  | 59 | # define MIN_PAGE_SIZE 4096 | 
|  | 60 | #endif | 
|  | 61 |  | 
|  | 62 | /* Since strings are short on average, we check the first 16 bytes | 
|  | 63 | of the string for a NUL character.  In order to do an unaligned ldp | 
|  | 64 | safely we have to do a page cross check first.  If there is a NUL | 
|  | 65 | byte we calculate the length from the 2 8-byte words using | 
|  | 66 | conditional select to reduce branch mispredictions (it is unlikely | 
|  | 67 | strlen will be repeatedly called on strings with the same length). | 
|  | 68 |  | 
|  | 69 | If the string is longer than 16 bytes, we align src so we don't need | 
|  | 70 | further page cross checks, and process 32 bytes per iteration | 
|  | 71 | using the fast NUL check.  If we encounter non-ASCII characters, | 
|  | 72 | fall back to a second loop using the full NUL check. | 
|  | 73 |  | 
|  | 74 | If the page cross check fails, we read 16 bytes from an aligned | 
|  | 75 | address, remove any characters before the string, and continue | 
|  | 76 | in the main loop using aligned loads.  Since strings crossing a | 
|  | 77 | page in the first 16 bytes are rare (probability of | 
|  | 78 | 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. | 
|  | 79 |  | 
|  | 80 | AArch64 systems have a minimum page size of 4k.  We don't bother | 
|  | 81 | checking for larger page sizes - the cost of setting up the correct | 
|  | 82 | page size is just not worth the extra gain from a small reduction in | 
|  | 83 | the cases taking the slow path.  Note that we only care about | 
|  | 84 | whether the first fetch, which may be misaligned, crosses a page | 
|  | 85 | boundary.  */ | 
|  | 86 |  | 
|  | 87 | ENTRY_ALIGN (strlen, 6)	/* In: srcin (x0) = string address.  Out: len (x0) = number of bytes before the first NUL.  */ | 
|  | 88 | and	tmp1, srcin, MIN_PAGE_SIZE - 1	/* tmp1 = offset of srcin within a MIN_PAGE_SIZE page.  */ | 
|  | 89 | mov	zeroones, REP8_01	/* Never overwritten below; used by every NUL check.  */ | 
|  | 90 | cmp	tmp1, MIN_PAGE_SIZE - 16 | 
|  | 91 | b.gt	L(page_cross)	/* A 16-byte load at srcin would run past the end of the page.  */ | 
|  | 92 | ldp	data1, data2, [srcin]	/* First 16 bytes of the string; srcin need not be aligned.  */ | 
|  | 93 | #ifdef __AARCH64EB__ | 
|  | 94 | /* For big-endian, carry propagation (if the final byte in the | 
|  | 95 | string is 0x01) means we cannot use has_nul1/2 directly. | 
|  | 96 | Since we expect strings to be small and early-exit, | 
|  | 97 | byte-swap the data now so has_nul1/2 will be correct.  */ | 
|  | 98 | rev	data1, data1 | 
|  | 99 | rev	data2, data2 | 
|  | 100 | #endif | 
|  | 101 | sub	tmp1, data1, zeroones | 
|  | 102 | orr	tmp2, data1, REP8_7f | 
|  | 103 | sub	tmp3, data2, zeroones | 
|  | 104 | orr	tmp4, data2, REP8_7f | 
|  | 105 | bics	has_nul1, tmp1, tmp2	/* (X - 1) & ~(X | 0x7f) for data1; Z set iff data1 has no NUL.  Must precede the bic below, which overwrites tmp2 (x5).  */ | 
|  | 106 | bic	has_nul2, tmp3, tmp4	/* Same syndrome for data2; flags untouched.  */ | 
|  | 107 | ccmp	has_nul2, 0, 0, eq	/* If data1 had no NUL, compare has_nul2 with 0; otherwise set NZCV = 0 (Z and C clear).  */ | 
|  | 108 | beq	L(main_loop_entry)	/* No NUL in the first 16 bytes.  */ | 
|  | 109 |  | 
|  | 110 | /* Enter with C = has_nul1 == 0.  */ | 
|  | 111 | csel	has_nul1, has_nul1, has_nul2, cc	/* Syndrome of the first word that contains a NUL.  */ | 
|  | 112 | mov	len, 8	/* len is x0, so srcin is gone; it is not needed on this path any more.  */ | 
|  | 113 | rev	has_nul1, has_nul1	/* Put the earliest string byte at the top so clz finds the first NUL.  */ | 
|  | 114 | clz	tmp1, has_nul1 | 
|  | 115 | csel	len, xzr, len, cc	/* Base offset: 0 if the NUL is in data1, 8 if it is in data2.  */ | 
|  | 116 | add	len, len, tmp1, lsr 3	/* Bit position / 8 = byte index within the word.  */ | 
|  | 117 | ret | 
|  | 118 |  | 
|  | 119 | /* The inner loop processes 32 bytes per iteration and uses the fast | 
|  | 120 | NUL check.  If we encounter non-ASCII characters, use a second | 
|  | 121 | loop with the accurate NUL check.  */ | 
|  | 122 | .p2align 4 | 
|  | 123 | L(main_loop_entry): | 
|  | 124 | bic	src, srcin, 15	/* Round srcin down to a 16-byte boundary.  */ | 
|  | 125 | sub	src, src, 16	/* Bias by -16 so the first pre-indexed load reads (srcin & ~15) + 16.  */ | 
|  | 126 | L(main_loop): | 
|  | 127 | ldp	data1, data2, [src, 32]!	/* src += 32, then load 16 bytes from the new src.  */ | 
|  | 128 | L(page_cross_entry):	/* Also reached from L(page_cross) with src = srcin & ~15 and data1/data2 already masked.  */ | 
|  | 129 | sub	tmp1, data1, zeroones | 
|  | 130 | sub	tmp3, data2, zeroones | 
|  | 131 | orr	tmp2, tmp1, tmp3	/* Combine both words so one test covers all 16 bytes.  */ | 
|  | 132 | tst	tmp2, zeroones, lsl 7	/* zeroones << 7 equals REP8_80: the fast (X - 1) & 0x80 check.  */ | 
|  | 133 | bne	1f	/* Possible NUL, or a false hit, in the 16 bytes at src.  */ | 
|  | 134 | ldp	data1, data2, [src, 16]	/* Second half of this iteration; src itself is not updated.  */ | 
|  | 135 | sub	tmp1, data1, zeroones | 
|  | 136 | sub	tmp3, data2, zeroones | 
|  | 137 | orr	tmp2, tmp1, tmp3 | 
|  | 138 | tst	tmp2, zeroones, lsl 7 | 
|  | 139 | beq	L(main_loop) | 
|  | 140 | add	src, src, 16	/* Make src the address of the block now held in data1/data2.  */ | 
|  | 141 | 1: | 
|  | 142 | /* The fast check failed, so do the slower, accurate NUL check.	 */ | 
|  | 143 | orr	tmp2, data1, REP8_7f	/* tmp1/tmp3 still hold data1/data2 - REP8_01 from the fast check.  */ | 
|  | 144 | orr	tmp4, data2, REP8_7f | 
|  | 145 | bics	has_nul1, tmp1, tmp2 | 
|  | 146 | bic	has_nul2, tmp3, tmp4 | 
|  | 147 | ccmp	has_nul2, 0, 0, eq | 
|  | 148 | beq	L(nonascii_loop)	/* No NUL after all: the fast check gave a false hit.  */ | 
|  | 149 |  | 
|  | 150 | /* Enter with C = has_nul1 == 0.  */ | 
|  | 151 | L(tail): | 
|  | 152 | #ifdef __AARCH64EB__ | 
|  | 153 | /* For big-endian, carry propagation (if the final byte in the | 
|  | 154 | string is 0x01) means we cannot use has_nul1/2 directly.  The | 
|  | 155 | easiest way to get the correct byte is to byte-swap the data | 
|  | 156 | and calculate the syndrome a second time.  */ | 
|  | 157 | csel	data1, data1, data2, cc	/* Pick the word that contains the NUL.  */ | 
|  | 158 | rev	data1, data1 | 
|  | 159 | sub	tmp1, data1, zeroones | 
|  | 160 | orr	tmp2, data1, REP8_7f | 
|  | 161 | bic	has_nul1, tmp1, tmp2 | 
|  | 162 | #else | 
|  | 163 | csel	has_nul1, has_nul1, has_nul2, cc	/* Syndrome of the first word that contains a NUL.  */ | 
|  | 164 | #endif | 
|  | 165 | sub	len, src, srcin	/* Offset of the current block from the string start; negative when src = srcin & ~15 lies below srcin (page-cross entry).  Overwrites srcin (x0).  */ | 
|  | 166 | rev	has_nul1, has_nul1 | 
|  | 167 | add	tmp2, len, 8	/* Offset of data2.  */ | 
|  | 168 | clz	tmp1, has_nul1 | 
|  | 169 | csel	len, len, tmp2, cc	/* No instruction since the ccmp sets flags, so C still says which word holds the NUL.  */ | 
|  | 170 | add	len, len, tmp1, lsr 3	/* Add the byte index of the NUL within that word.  */ | 
|  | 171 | ret | 
|  | 172 |  | 
|  | 173 | L(nonascii_loop):	/* Accurate check on every 16-byte block; the body is unrolled twice.  */ | 
|  | 174 | ldp	data1, data2, [src, 16]! | 
|  | 175 | sub	tmp1, data1, zeroones | 
|  | 176 | orr	tmp2, data1, REP8_7f | 
|  | 177 | sub	tmp3, data2, zeroones | 
|  | 178 | orr	tmp4, data2, REP8_7f | 
|  | 179 | bics	has_nul1, tmp1, tmp2 | 
|  | 180 | bic	has_nul2, tmp3, tmp4 | 
|  | 181 | ccmp	has_nul2, 0, 0, eq | 
|  | 182 | bne	L(tail)	/* NUL found; C is set up as L(tail) expects.  */ | 
|  | 183 | ldp	data1, data2, [src, 16]! | 
|  | 184 | sub	tmp1, data1, zeroones | 
|  | 185 | orr	tmp2, data1, REP8_7f | 
|  | 186 | sub	tmp3, data2, zeroones | 
|  | 187 | orr	tmp4, data2, REP8_7f | 
|  | 188 | bics	has_nul1, tmp1, tmp2 | 
|  | 189 | bic	has_nul2, tmp3, tmp4 | 
|  | 190 | ccmp	has_nul2, 0, 0, eq | 
|  | 191 | beq	L(nonascii_loop) | 
|  | 192 | b	L(tail) | 
|  | 193 |  | 
|  | 194 | /* Load 16 bytes from [srcin & ~15] and force the bytes that precede | 
|  | 195 | srcin to be non-zero (their low seven bits are set, or the whole first word is set to all-ones), so we ignore any NUL bytes before the string. | 
|  | 196 | Then continue in the aligned loop.  */ | 
|  | 197 | L(page_cross): | 
|  | 198 | bic	src, srcin, 15 | 
|  | 199 | ldp	data1, data2, [src]	/* 16-byte load from a 16-byte-aligned address.  */ | 
|  | 200 | lsl	tmp1, srcin, 3	/* Byte offset to bit offset; the shift below uses only the low 6 bits.  */ | 
|  | 201 | mov	tmp4, -1 | 
|  | 202 | #ifdef __AARCH64EB__ | 
|  | 203 | /* Big-endian.	Early bytes are at MSB.	 */ | 
|  | 204 | lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */ | 
|  | 205 | #else | 
|  | 206 | /* Little-endian.  Early bytes are at LSB.  */ | 
|  | 207 | lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */ | 
|  | 208 | #endif | 
|  | 209 | orr	tmp1, tmp1, REP8_80	/* ~tmp1 is now 0x7f in each byte lane before srcin and 0 elsewhere.  */ | 
|  | 210 | orn	data1, data1, tmp1	/* data1 |= ~tmp1.  */ | 
|  | 211 | orn	tmp2, data2, tmp1	/* Masked copy of data2, used only if srcin is in the second word.  */ | 
|  | 212 | tst	srcin, 8	/* Z set iff srcin lies in the first 8-byte word of the block.  */ | 
|  | 213 | csel	data1, data1, tmp4, eq	/* Otherwise all of data1 precedes the string: replace it with all-ones.  */ | 
|  | 214 | csel	data2, data2, tmp2, eq	/* Otherwise use the masked copy of data2.  */ | 
|  | 215 | b	L(page_cross_entry) | 
|  | 216 | END (strlen) | 
|  | 217 | libc_hidden_builtin_def (strlen) | 