/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
| 18 | |
/* Bytewise compare two null-terminated strings of length no longer than N.  */

#include <sysdep.h>

	/* We hand-schedule every instruction (see the e0/e1 slot notes
	   below), so the assembler must not reorder; and the PROF path
	   uses AT explicitly, so it must not be reserved for assembler
	   macro expansion.  */
	.set noat
	.set noreorder

/* EV6 only predicts one branch per octaword.  We'll use these to push
   subsequent branches back to the next bundle.  This will generally add
   a fetch+decode cycle to older machines, so skip in that case.
   (__alpha_fix__ marks the FIX extension — presumably the build targets
   EV6-class CPUs when it is defined.)  */
#ifdef __alpha_fix__
# define ev6_unop	unop
#else
# define ev6_unop
#endif

	.text
| 36 | |
/* int strncmp (const char *s1, const char *s2, size_t n)

   In:   a0 = s1, a1 = s2, a2 = n (count)
   Out:  v0 = 0 if equal through min(strlen, n) bytes,
	      1 if s1 byte > s2 byte at first difference, -1 otherwise.

   Strategy: compare a whole aligned 8-byte word of s1 at a time,
   using cmpbge against zero to detect an embedded NUL and ornot/msk*
   masking to neutralize bytes outside the string or count.  Unaligned
   s2 words are assembled with extql/extqh.  Loads never cross past
   the last word actually owned by either string (see $ant_loop and
   the s2 end-of-count tracking in a3), so no stray page is touched.

   Scratch register roles (per code below):
     t0/t1 = current s1/s2 word	  t2/t3/t4/t5 = temporaries
     t6 = LONG_MAX		  t7 = cmpbge null/eos mask
     t8/t9 = temporaries	  t10 = ofs of last byte in s1 last word
     a3 = s2 word count remaining (unaligned path only).  */

ENTRY(strncmp)
#ifdef PROF
	ldgp	gp, 0(pv)		# establish gp for the _mcount call
	lda	AT, _mcount
	jsr	AT, (AT), _mcount	# gprof hook; clobbers AT (hence .set noat)
	.prologue 1
#else
	.prologue 0
#endif

	xor	a0, a1, t2	# are s1 and s2 co-aligned?
	beq	a2, $zerolength	# n == 0: strings compare equal
	ldq_u	t0, 0(a0)	# load asap to give cache time to catch up
	ldq_u	t1, 0(a1)
	lda	t3, -1		# t3 = all-ones, source for msk* masks
	and	t2, 7, t2	# nonzero iff low 3 addr bits differ
	srl	t3, 1, t6	# t6 = LONG_MAX
	and	a0, 7, t4	# find s1 misalignment
	and	a1, 7, t5	# find s2 misalignment
	cmovlt	a2, t6, a2	# bound neg count to LONG_MAX
	addq	a1, a2, a3	# s2+count
	addq	a2, t4, a2	# bias count by s1 misalignment
	and	a2, 7, t10	# ofs of last byte in s1 last word
	srl	a2, 3, a2	# remaining full words in s1 count
	bne	t2, $unaligned

	/* On entry to this basic block:
	   t0 == the first word of s1.
	   t1 == the first word of s2.
	   t3 == -1.  */
$aligned:
	mskqh	t3, a1, t8	# mask off leading garbage
	ornot	t1, t8, t1	# force garbage bytes to 0xff in both words:
	ornot	t0, t8, t0	# they compare equal and cannot look like NUL
	cmpbge	zero, t1, t7	# bits set iff null found
	beq	a2, $eoc	# check end of count
	bne	t7, $eos
	beq	t10, $ant_loop	# count ends exactly on a word boundary:
				# use the loop that never over-reads

	/* Aligned compare main loop.
	   On entry to this basic block:
	   t0 == an s1 word.
	   t1 == an s2 word not containing a null.  */

	.align 4
$a_loop:
	xor	t0, t1, t2	# e0 : words differ?
	bne	t2, $wordcmp	# .. e1 (zdb)
	ldq_u	t1, 8(a1)	# e0 :
	ldq_u	t0, 8(a0)	# .. e1 :

	subq	a2, 1, a2	# e0 :
	addq	a1, 8, a1	# .. e1 :
	addq	a0, 8, a0	# e0 :
	beq	a2, $eoc	# .. e1 : last word: trim to count

	cmpbge	zero, t1, t7	# e0 : NUL anywhere in new s2 word?
	beq	t7, $a_loop	# .. e1 :

	br	$eos

	/* Alternate aligned compare loop, for when there's no trailing
	   bytes on the count.  We have to avoid reading too much data:
	   the count is decremented and checked BEFORE the next load.  */
	.align 4
$ant_loop:
	xor	t0, t1, t2	# e0 : words differ?
	ev6_unop
	ev6_unop
	bne	t2, $wordcmp	# .. e1 (zdb)

	subq	a2, 1, a2	# e0 :
	beq	a2, $zerolength	# .. e1 : count exhausted, words equal
	ldq_u	t1, 8(a1)	# e0 :
	ldq_u	t0, 8(a0)	# .. e1 :

	addq	a1, 8, a1	# e0 :
	addq	a0, 8, a0	# .. e1 :
	cmpbge	zero, t1, t7	# e0 : NUL anywhere in new s2 word?
	beq	t7, $ant_loop	# .. e1 :

	br	$eos

	/* The two strings are not co-aligned.  Align s1 and cope.  */
	/* On entry to this basic block:
	   t0 == the first word of s1.
	   t1 == the first word of s2.
	   t3 == -1.
	   t4 == misalignment of s1.
	   t5 == misalignment of s2.
	   t10 == misalignment of s1 end.  */
	.align 4
$unaligned:
	/* If s1 misalignment is larger than s2 misalignment, we need
	   extra startup checks to avoid SEGV.  */
	subq	a1, t4, a1	# adjust s2 for s1 misalignment
	cmpult	t4, t5, t9
	subq	a3, 1, a3	# last byte of s2
	bic	a1, 7, t8
	mskqh	t3, t5, t7	# mask garbage in s2
	subq	a3, t8, a3
	ornot	t1, t7, t7
	srl	a3, 3, a3	# remaining full words in s2 count
	beq	t9, $u_head

	/* Failing that, we need to look for both eos and eoc within the
	   first word of s2.  If we find either, we can continue by
	   pretending that the next word of s2 is all zeros.  */
	lda	t2, 0		# next = zero
	cmpeq	a3, 0, t8	# eoc in the first word of s2?
	cmpbge	zero, t7, t7	# eos in the first word of s2?
	or	t7, t8, t8
	bne	t8, $u_head_nl	# skip the 8(a1) load — it could fault

	/* We know just enough now to be able to assemble the first
	   full word of s2.  We can still find a zero at the end of it.

	   On entry to this basic block:
	   t0 == first word of s1
	   t1 == first partial word of s2.
	   t3 == -1.
	   t10 == ofs of last byte in s1 last word.
	   NOTE(review): an earlier revision's comments also tracked
	   "t11 == ofs of last byte in s2 last word", but t11 is never
	   written in this file — appears stale; confirm against
	   strcmp.S history.  */
$u_head:
	ldq_u	t2, 8(a1)	# load second partial s2 word
	subq	a3, 1, a3
$u_head_nl:
	extql	t1, a1, t1	# create first s2 word
	mskqh	t3, a0, t8
	extqh	t2, a1, t4
	ornot	t0, t8, t0	# kill s1 garbage
	or	t1, t4, t1	# s2 word now complete
	cmpbge	zero, t0, t7	# find eos in first s1 word
	ornot	t1, t8, t1	# kill s2 garbage
	beq	a2, $eoc
	subq	a2, 1, a2
	bne	t7, $eos
	mskql	t3, a1, t8	# mask out s2[1] bits we have seen
	xor	t0, t1, t4	# compare aligned words
	or	t2, t8, t8
	bne	t4, $wordcmp
	cmpbge	zero, t8, t7	# eos in high bits of s2[1]?
	cmpeq	a3, 0, t8	# eoc in s2[1]?
	or	t7, t8, t7
	bne	t7, $u_final

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned words from s2.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t2 == the unshifted low-bits from the next s2 word.
	   t10 == ofs of last byte in s1 last word.
	   (See NOTE(review) above regarding the stale t11 reference.)  */
	.align 4
$u_loop:
	extql	t2, a1, t3	# e0 :
	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
	ldq_u	t0, 8(a0)	# e0 : load next s1 word
	addq	a1, 8, a1	# .. e1 :

	addq	a0, 8, a0	# e0 :
	subq	a3, 1, a3	# .. e1 :
	extqh	t2, a1, t1	# e0 :
	cmpbge	zero, t0, t7	# .. e1 : eos in current s1 word

	or	t1, t3, t1	# e0 : s2 word assembled from two halves
	beq	a2, $eoc	# .. e1 : eoc in current s1 word
	subq	a2, 1, a2	# e0 :
	cmpbge	zero, t2, t4	# .. e1 : eos in s2[1]

	xor	t0, t1, t3	# e0 : compare the words
	ev6_unop
	ev6_unop
	bne	t7, $eos	# .. e1 :

	cmpeq	a3, 0, t5	# e0 : eoc in s2[1]
	ev6_unop
	ev6_unop
	bne	t3, $wordcmp	# .. e1 :

	or	t4, t5, t4	# e0 : eos or eoc in s2[1].
	beq	t4, $u_loop	# .. e1 (zdb)

	/* We've found a zero in the low bits of the last s2 word.  Get
	   the next s1 word and align them.  */
	.align 3
$u_final:
	ldq_u	t0, 8(a0)
	extql	t2, a1, t1
	cmpbge	zero, t1, t7
	bne	a2, $eos

	/* We've hit end of count.  Zero everything after the count
	   and compare whats left.  */
	.align 3
$eoc:
	mskql	t0, t10, t0	# t10 = ofs of last valid s1 byte
	mskql	t1, t10, t1
	cmpbge	zero, t1, t7	# zeroed tail guarantees t7 != 0 below

	/* We've found a zero somewhere in a word we just read.
	   On entry to this basic block:
	   t0 == s1 word
	   t1 == s2 word
	   t7 == cmpbge mask containing the zero.  */
	.align 3
$eos:
	negq	t7, t6		# create bytemask of valid data:
	and	t6, t7, t8	# t8 = lowest set bit of t7 (first NUL)
	subq	t8, 1, t6
	or	t6, t8, t7	# ones through the first NUL byte inclusive
	zapnot	t0, t7, t0	# kill the garbage
	zapnot	t1, t7, t1
	xor	t0, t1, v0	# ... and compare
	beq	v0, $done

	/* Here we have two differing co-aligned words in t0 & t1.
	   Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
	.align 3
$wordcmp:
	cmpbge	t0, t1, t2	# comparison yields bit mask of ge
	cmpbge	t1, t0, t3
	xor	t2, t3, t0	# bits set iff t0/t1 bytes differ
	negq	t0, t1		# clear all but least bit
	and	t0, t1, t0	# t0 = bit of FIRST differing byte
	lda	v0, -1		# assume s1 < s2
	and	t0, t2, t1	# was bit set in t0 > t1?
	cmovne	t1, 1, v0
$done:
	ret

	.align 3
$zerolength:
	clr	v0		# equal: return 0
	ret

END(strncmp)
libc_hidden_builtin_def (strncmp)