lh | 9ed821d | 2023-04-07 01:36:19 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com> |
| 3 | * |
| 4 | * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball |
| 5 | */ |
| 6 | |
| 7 | #if !defined _STRING_H |
| 8 | #error "Never use <libc-string_i386.h> directly; include <string.h> instead" |
| 9 | #endif |
| 10 | |
| 11 | #ifndef _LIBC_STRING_i386_H |
| 12 | #define _LIBC_STRING_i386_H 1 |
| 13 | |
static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	/* memset() specialization for the case where both the fill byte
	 * (parameter named "eax" after the register it will occupy) and
	 * count are compile-time constants: the if-chain below constant-
	 * folds down to a single case.  Only reached through the memset()
	 * macro below, which guarantees count <= 6*4 or count % 4 == 0.
	 * Returns s, like memset().
	 * NOTE: arithmetic on void* is a GCC extension (sizeof(void)==1). */
	int ecx, edi;

	if (count == 0)
		return s;

	/* Very small (2 stores or less) are best done with direct
	 * mov <const>,<mem> instructions (they do not clobber registers) */
	if (count == 1) {
		*(char *)(s + 0) = eax;
		return s;
	}

	/* You wonder why & 0xff is needed? Try memset(p, '\xff', size).
	 * If char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */

	if (count == 2) {
		*(short *)(s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)(s + 0) = eax;
		*(char *) (s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)(s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) (s + 0) = eax;
		*(char *)(s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *) (s + 0) = eax;
		*(short *)(s + 4) = eax;
		return s;
	}

	/* Small string stores: don't clobber ecx
	 * (clobbers only eax and edi) */
	/* Constraints: "0" (s) pre-loads %edi with s (tied to the "=&D"
	 * earlyclobber output); %eax already holds the replicated fill
	 * word, so each stosl stores 4 bytes of it. */
#define small_store(arg) { \
	__asm__ __volatile__( \
	arg \
	: "=&D" (edi) \
	: "a" (eax), "0" (s) \
	: "memory" \
	); \
	return s; \
}
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		((int *)s)[0] = eax;
		((int *)s)[1] = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_store

	/* Not small, but multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	/* count % 4 == 0 here, guaranteed by the memset() macro below. */
	__asm__ __volatile__(
	"	rep; stosl\n"
	: "=&c" (ecx), "=&D" (edi)
	: "a" (eax), "0" (count / 4), "1" (s)
	: "memory"
	);
	return s;
}
#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
/* Divert memset() calls with constant c and constant count to the inline
 * expansion above, but only for the sizes it handles well: anything up
 * to 6*4 bytes, or any multiple of 4.  Everything else (and non-constant
 * arguments) falls through to the real memset() -- the name inside its
 * own macro expansion is not expanded again. */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
	)
#endif
| 120 | |
| 121 | |
static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	/* mempcpy() specialization for compile-time-constant count: the
	 * if-chain below constant-folds down to a single case.  Only
	 * reached through the mempcpy()/memcpy() macros below, which
	 * guarantee count <= 6*4 or count % 4 == 0.
	 * Returns d + count (one past the copied area), like mempcpy().
	 * NOTE: arithmetic on void* is a GCC extension (sizeof(void)==1). */
	int ecx;
	char *esi, *edi;

	if (count == 0)
		return d;

	if (count == 1) {
		*(char *)d = *(char *)s;
		return d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return d + 2;
	}
	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
	/* Constraints: "0" (s)/"1" (d) pre-load %esi/%edi; after the movs
	 * insns %edi points one past the copied area -- exactly the
	 * mempcpy() return value. */
#define small_move(arg) { \
	__asm__ __volatile__( \
	arg \
	: "=&S" (esi), "=&D" (edi) \
	: "0" (s), "1" (d) \
	: "memory" \
	); \
	return edi; \
}
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_move

	/* Not small, but multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	/* count % 4 == 0 here, guaranteed by the macros below. */
	__asm__ __volatile__(
	"	rep; movsl\n"
	: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
	: "0" (count / 4), "1" (s), "2" (d)
	: "memory"
	);
	return edi;
}
| 196 | static __always_inline |
| 197 | void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count) |
| 198 | { |
| 199 | inlined_mempcpy_const_count4(d, s, count); |
| 200 | return d; |
| 201 | } |
#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
/* Divert mempcpy()/memcpy() calls with constant count to the inline
 * versions above, for the sizes they handle well: up to 6*4 bytes, or
 * any multiple of 4.  Everything else still calls the real function --
 * the name inside its own macro expansion is not expanded again. */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
	)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
	)
#endif
| 218 | |
| 219 | |
| 220 | static __always_inline |
| 221 | size_t inlined_strlen(const char *s) |
| 222 | { |
| 223 | int edi; |
| 224 | int ecx; |
| 225 | __asm__ __volatile__( |
| 226 | " repne; scasb\n" |
| 227 | /* " notl %0\n" */ |
| 228 | /* " decl %0\n" */ |
| 229 | : "=c" (ecx), "=&D" (edi) |
| 230 | : "1" (s), "a" (0), "0" (0xffffffffu) |
| 231 | /* : no clobbers */ |
| 232 | ); |
| 233 | return -ecx - 1; |
| 234 | } |
#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
/* Disabled: expanding every strlen() call site inline costs far more
 * code size than the calls it removes (see measurement above). */
#define strlen(s) inlined_strlen(s)
#endif
| 238 | |
| 239 | |
static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	/* stpcpy(): copy src including its NUL terminator to dest with a
	 * byte-at-a-time lodsb/stosb loop (loops until the byte just
	 * stored tests as zero).  On exit %edi points one past the stored
	 * NUL, so "edi - 1" is the pointer to dest's terminating NUL that
	 * stpcpy() must return. */
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
	"1:	lodsb\n"
	"	stosb\n"
	"	testb %%al, %%al\n"
	"	jnz 1b\n"
	: "=&S" (esi), "=&D" (edi), "=&a" (eax)
	: "0" (src), "1" (dest)
	: "memory"
	);
	return edi - 1;
}
| 256 | static __always_inline |
| 257 | char *inlined_strcpy(char *dest, const char *src) |
| 258 | { |
| 259 | inlined_stpcpy(dest, src); |
| 260 | return dest; |
| 261 | } |
#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
/* Disabled: inlining every stpcpy()/strcpy() call site costs more code
 * size than it saves (see measurement above). */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif
| 266 | |
| 267 | |
static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	/* memchr(): scan count bytes for c; return pointer to the match
	 * or NULL.
	 * "jecxz 1f": count == 0 -> NULL without touching memory.
	 * After "repne scasb", %edi points one past the last byte
	 * examined; "leal -1" backs it up to that byte.  lea does not
	 * modify flags, so "je" still tests scasb's ZF: taken on a match
	 * (return the pointer), not taken when count ran out (fall
	 * through to the xorl and return NULL). */
	void *edi;
	int ecx;
	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx)
	: "a" (c), "0" (s), "1" (count)
	/* : no clobbers */
	);
	return edi;
}
static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
	/* memchr() variant for compile-time-constant c: same scan as
	 * inlined_memchr() (see its comments for the flag handling), but
	 * c can be encoded as an immediate "movb $c, %al", saving the
	 * wider mov into %eax. */
#if defined __OPTIMIZE__
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	movb %4, %%al\n" /* const c to %%al */
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
	: "0" (s), "i" (c), "1" (count)
	/* : no clobbers */
	);
	return edi;
#else
	/* With -O0, gcc can't figure out how to encode CONST c
	 * as an immediate operand. Generating slightly bigger code
	 * (usually "movl CONST,%eax", 3 bytes bigger than needed):
	 */
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
	: "0" (s), "2" (c), "1" (count)
	/* : no clobbers */
	);
	return edi;
#endif
}
#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
/* With a constant c use the immediate-operand variant; "& 0xff"
 * normalizes a possibly negative char constant (e.g. '\xff' when char
 * is signed) to the byte value scasb actually compares against. */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
	)
#endif
| 337 | |
| 338 | #endif /* _LIBC_STRING_i386_H */ |