/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include <sysdep.h>

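/* 0x01, 0x7f and 0x80 replicated into every byte of a 64-bit word, for
   the per-dword NUL-detection test used below.  */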
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define result		x0

/* Internal variables.  */
#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define syndrome	x6
#define tmp1		x7
#define tmp2		x8
#define tmp3		x9
#define zeroones	x10
#define pos		x11

	/* Start of performance-critical section  -- one 64B cache line.  */
ENTRY_ALIGN(strcmp, 6)
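	/* The alignment argument of 6 requests a 2^6 = 64-byte boundary, so
	   the critical loop below should fit within a single cache line.  */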

	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Sources mutually aligned?  */
	b.ne	L(misaligned8)
	ands	tmp1, src1, #7		/* Already at a dword boundary?  */
	b.ne	L(mutual_align)
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
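	/* For example, for a single byte X:
	       X = 0x00:  (X - 1) & ~X & 0x80 = 0xff & 0xff & 0x80 = 0x80
	       X = 0x41:  (X - 1) & ~X & 0x80 = 0x40 & 0xbe & 0x80 = 0x00
	       X = 0x80:  (X - 1) & ~X & 0x80 = 0x7f & 0x7f & 0x80 = 0x00
	   Across a whole dword, a borrow out of a zero byte can also set
	   marker bits in more significant bytes, but the least significant
	   marker still flags the first zero byte; the big-endian code below
	   recomputes the test on a byte-reversed value for this reason.  */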
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	orr	syndrome, diff, has_nul
	cbz	syndrome, L(loop_aligned)
	/* End of performance-critical section  -- one 64B cache line.  */

#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
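	/* For example, when comparing "abc" with "abcdefgh", the first dwords
	   loaded might be (little-endian; '??' is whatever happens to follow
	   the terminator in memory):
	       data1 = 0x????????00636261
	       data2 = 0x6867666564636261
	   Byte 3 of the syndrome is (0x00 ^ 0x64) | 0x80 = 0xe4 and bytes 0-2
	   are clear, so after the REVs the highest set bit is bit 39 and clz
	   gives pos = 24.  Shifting each reversed dword left by 24 and then
	   right by 56 extracts byte 3 of the original data, 0x00 and 0x64,
	   giving the negative result required for "abc" < "abcdefgh".  */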
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	RET
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
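	/* For example, if the string ends "...\x01\0", the borrow out of the
	   zero byte in (data1 - zeroones) also marks the preceding 0x01 byte
	   as a terminator; in big-endian byte order that earlier, spurious
	   marker is the one clz would find.  The code at 1: below therefore
	   recomputes the test on a byte-reversed copy, where borrows can only
	   create spurious markers for bytes after the real terminator.  */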
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	RET
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	RET
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
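	/* For example, if both pointers are at an 8n+3 boundary, tmp1 is 3,
	   so below it becomes a shift of -24, i.e. 64 - 24 = 40 bits, leaving
	   0xff in the three bytes of the mask that correspond to the bytes
	   before the start point.  ORing the mask into both data words makes
	   those bytes compare equal and non-NUL, so they cannot cause a false
	   mismatch or a false terminator.  */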
	bic	src1, src1, #7
	bic	src2, src2, #7
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

L(misaligned8):
	/* We can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
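	/* The CMP sets C iff data1w is non-zero; the CCMP then compares the
	   two bytes only in that case and otherwise forces "not equal", so
	   the B.EQ loops while the bytes match and the terminator has not
	   been reached.  */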
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(misaligned8)
	sub	result, data1, data2
	RET
END(strcmp)
libc_hidden_builtin_def (strcmp)