/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Bytewise compare two null-terminated strings of length no longer than N.  */

#include <sysdep.h>

	.set noat
	.set noreorder

/* EV6 only predicts one branch per octaword.  We'll use these to push
   subsequent branches back to the next bundle.  This will generally add
   a fetch+decode cycle to older machines, so skip in that case.  */
#ifdef __alpha_fix__
# define ev6_unop	unop
#else
# define ev6_unop
#endif

	.text

/* int strncmp (const char *s1, const char *s2, size_t n)
   Alpha calling convention:
     In:   a0 = s1, a1 = s2, a2 = n (byte count)
     Out:  v0 = <0 / 0 / >0 per the C strncmp contract
   Scratch: t0-t11, a3; AT used only in the PROF prologue.
   Strategy: compare a word (8 bytes) at a time, using cmpbge against
   zero to spot the terminating null, with separate paths for
   co-aligned and unaligned source strings.  Loads are done with ldq_u
   and carefully bounded so we never read past either string's last
   accessible word.  */

ENTRY(strncmp)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	xor	a0, a1, t2	# are s1 and s2 co-aligned?
	beq	a2, $zerolength
	ldq_u	t0, 0(a0)	# load asap to give cache time to catch up
	ldq_u	t1, 0(a1)
	lda	t3, -1
	and	t2, 7, t2
	srl	t3, 1, t6
	and	a0, 7, t4	# find s1 misalignment
	and	a1, 7, t5	# find s2 misalignment
	cmovlt	a2, t6, a2	# bound neg count to LONG_MAX
	addq	a1, a2, a3	# s2+count
	addq	a2, t4, a2	# bias count by s1 misalignment
	and	a2, 7, t10	# ofs of last byte in s1 last word
	srl	a2, 3, a2	# remaining full words in s1 count
	bne	t2, $unaligned

	/* On entry to this basic block:
	   t0 == the first word of s1.
	   t1 == the first word of s2.
	   t3 == -1.  */
$aligned:
	mskqh	t3, a1, t8	# mask off leading garbage
	ornot	t1, t8, t1
	ornot	t0, t8, t0
	cmpbge	zero, t1, t7	# bits set iff null found
	beq	a2, $eoc	# check end of count
	bne	t7, $eos
	beq	t10, $ant_loop

	/* Aligned compare main loop.
	   On entry to this basic block:
	   t0 == an s1 word.
	   t1 == an s2 word not containing a null.  */

	.align 4
$a_loop:
	xor	t0, t1, t2	# e0	:
	bne	t2, $wordcmp	# .. e1 (zdb)
	ldq_u	t1, 8(a1)	# e0    :
	ldq_u	t0, 8(a0)	# .. e1 :

	subq	a2, 1, a2	# e0    :
	addq	a1, 8, a1	# .. e1 :
	addq	a0, 8, a0	# e0    :
	beq	a2, $eoc	# .. e1 :

	cmpbge	zero, t1, t7	# e0    :
	beq	t7, $a_loop	# .. e1 :

	br	$eos

	/* Alternate aligned compare loop, for when there's no trailing
	   bytes on the count.  We have to avoid reading too much data.  */
	.align 4
$ant_loop:
	xor	t0, t1, t2	# e0	:
	ev6_unop
	ev6_unop
	bne	t2, $wordcmp	# .. e1 (zdb)

	subq	a2, 1, a2	# e0    :
	beq	a2, $zerolength	# .. e1 :
	ldq_u	t1, 8(a1)	# e0    :
	ldq_u	t0, 8(a0)	# .. e1 :

	addq	a1, 8, a1	# e0    :
	addq	a0, 8, a0	# .. e1 :
	cmpbge	zero, t1, t7	# e0    :
	beq	t7, $ant_loop	# .. e1 :

	br	$eos

	/* The two strings are not co-aligned.  Align s1 and cope.  */
	/* On entry to this basic block:
	   t0 == the first word of s1.
	   t1 == the first word of s2.
	   t3 == -1.
	   t4 == misalignment of s1.
	   t5 == misalignment of s2.
	   t10 == misalignment of s1 end.  */
	.align	4
$unaligned:
	/* If s1 misalignment is larger than s2 misalignment, we need
	   extra startup checks to avoid SEGV.  */
	subq	a1, t4, a1	# adjust s2 for s1 misalignment
	cmpult	t4, t5, t9
	subq	a3, 1, a3	# last byte of s2
	bic	a1, 7, t8
	mskqh	t3, t5, t7	# mask garbage in s2
	subq	a3, t8, a3
	ornot	t1, t7, t7
	srl	a3, 3, a3	# remaining full words in s2 count
	beq	t9, $u_head

	/* Failing that, we need to look for both eos and eoc within the
	   first word of s2.  If we find either, we can continue by
	   pretending that the next word of s2 is all zeros.  */
	lda	t2, 0		# next = zero
	cmpeq	a3, 0, t8	# eoc in the first word of s2?
	cmpbge	zero, t7, t7	# eos in the first word of s2?
	or	t7, t8, t8
	bne	t8, $u_head_nl

	/* We know just enough now to be able to assemble the first
	   full word of s2.  We can still find a zero at the end of it.

	   On entry to this basic block:
	   t0 == first word of s1
	   t1 == first partial word of s2.
	   t3 == -1.
	   t10 == ofs of last byte in s1 last word.
	   t11 == ofs of last byte in s2 last word.  */
$u_head:
	ldq_u	t2, 8(a1)	# load second partial s2 word
	subq	a3, 1, a3
$u_head_nl:
	extql	t1, a1, t1	# create first s2 word
	mskqh	t3, a0, t8
	extqh	t2, a1, t4
	ornot	t0, t8, t0	# kill s1 garbage
	or	t1, t4, t1	# s2 word now complete
	cmpbge	zero, t0, t7	# find eos in first s1 word
	ornot	t1, t8, t1	# kill s2 garbage
	beq	a2, $eoc
	subq	a2, 1, a2
	bne	t7, $eos
	mskql	t3, a1, t8	# mask out s2[1] bits we have seen
	xor	t0, t1, t4	# compare aligned words
	or	t2, t8, t8
	bne	t4, $wordcmp
	cmpbge	zero, t8, t7	# eos in high bits of s2[1]?
	cmpeq	a3, 0, t8	# eoc in s2[1]?
	or	t7, t8, t7
	bne	t7, $u_final

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned words from s2.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t2 == the unshifted low-bits from the next s2 word.
	   t10 == ofs of last byte in s1 last word.
	   t11 == ofs of last byte in s2 last word.  */
	.align 4
$u_loop:
	extql	t2, a1, t3	# e0    :
	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
	ldq_u	t0, 8(a0)	# e0    : load next s1 word
	addq	a1, 8, a1	# .. e1 :

	addq	a0, 8, a0	# e0    :
	subq	a3, 1, a3	# .. e1 :
	extqh	t2, a1, t1	# e0    :
	cmpbge	zero, t0, t7	# .. e1 : eos in current s1 word

	or	t1, t3, t1	# e0    :
	beq	a2, $eoc	# .. e1 : eoc in current s1 word
	subq	a2, 1, a2	# e0    :
	cmpbge	zero, t2, t4	# .. e1 : eos in s2[1]

	xor	t0, t1, t3	# e0    : compare the words
	ev6_unop
	ev6_unop
	bne	t7, $eos	# .. e1 :

	cmpeq	a3, 0, t5	# e0    : eoc in s2[1]
	ev6_unop
	ev6_unop
	bne	t3, $wordcmp	# .. e1 :

	or	t4, t5, t4	# e0    : eos or eoc in s2[1].
	beq	t4, $u_loop	# .. e1 (zdb)

	/* We've found a zero in the low bits of the last s2 word.  Get
	   the next s1 word and align them.  */
	.align 3
$u_final:
	ldq_u	t0, 8(a0)
	extql	t2, a1, t1
	cmpbge	zero, t1, t7
	bne	a2, $eos

	/* We've hit end of count.  Zero everything after the count
	   and compare whats left.  */
	.align 3
$eoc:
	mskql	t0, t10, t0
	mskql	t1, t10, t1
	cmpbge	zero, t1, t7

	/* We've found a zero somewhere in a word we just read.
	   On entry to this basic block:
	   t0 == s1 word
	   t1 == s2 word
	   t7 == cmpbge mask containing the zero.  */
	.align 3
$eos:
	negq	t7, t6		# create bytemask of valid data
	and	t6, t7, t8
	subq	t8, 1, t6
	or	t6, t8, t7
	zapnot	t0, t7, t0	# kill the garbage
	zapnot	t1, t7, t1
	xor	t0, t1, v0	# ... and compare
	beq	v0, $done

	/* Here we have two differing co-aligned words in t0 & t1.
	   Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
	.align 3
$wordcmp:
	cmpbge	t0, t1, t2	# comparison yields bit mask of ge
	cmpbge	t1, t0, t3
	xor	t2, t3, t0	# bits set iff t0/t1 bytes differ
	negq	t0, t1		# clear all but least bit
	and	t0, t1, t0
	lda	v0, -1
	and	t0, t2, t1	# was bit set in t0 > t1?
	cmovne	t1, 1, v0
$done:
	ret

	.align 3
$zerolength:
	clr	v0
	ret

	END(strncmp)
libc_hidden_builtin_def (strncmp)