| /* Optimized strcmp for Xtensa. |
| Copyright (C) 2001, 2007 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, write to the Free |
| Software Foundation, Inc., 51 Franklin Street - Fifth Floor, |
| Boston, MA 02110-1301, USA. */ |
| |
| #include "../../sysdeps/linux/xtensa/sysdep.h" |
| #include <bits/xtensa-config.h> |
| #include <features.h> |
| |
| #ifdef __XTENSA_EB__ /* big-endian: byte 0 of a word is its most significant byte */ |
| #define MASK0 0xff000000 |
| #define MASK1 0x00ff0000 |
| #define MASK2 0x0000ff00 |
| #define MASK3 0x000000ff |
| #else /* little-endian: byte 0 of a word is its least significant byte */ |
| #define MASK0 0x000000ff |
| #define MASK1 0x0000ff00 |
| #define MASK2 0x00ff0000 |
| #define MASK3 0xff000000 |
| #endif |
| |
| #define MASK4 0x40404040 /* bit 6 of each of the four bytes in a word */ |
| |
| .literal .Lmask0, MASK0 /* literal copies of the masks above; strcmp loads them with l32r */ |
| .literal .Lmask1, MASK1 |
| .literal .Lmask2, MASK2 |
| .literal .Lmask3, MASK3 |
| .literal .Lmask4, MASK4 |
| |
| .text |
| /* int strcmp (const char *s1, const char *s2) |
| |
| In: a2 = s1, a3 = s2. |
| Out: a2 = 0 if the strings are equal, otherwise a negative or positive |
| value whose sign follows the first differing byte (bytes are |
| compared as unsigned values). |
| Scratch: a4-a11. Every exit is a retw. |
| |
| Outline: |
| 1. Compare byte 0. |
| 2. If both pointers share the same misalignment, compare up to three |
| bytes one at a time until they are word-aligned; if the |
| misalignments differ, compare byte by byte to the end (.Lunaligned). |
| 3. Aligned strings go through a fast word loop that uses the MASK4 |
| test to decide that a word "definitely has no zero byte". |
| 4. When that test fails without a zero byte being present, the rest |
| of the string is handled by a second word loop that tests each of |
| the four bytes for zero (.Lprobeq onwards). |
| 5. .Lwne2/.Lwne turn a pair of unequal words into the return value. */ |
| ENTRY (strcmp) |
| /* a2 = s1, a3 = s2 */ |
| |
| l8ui a8, a2, 0 /* byte 0 from s1 */ |
| l8ui a9, a3, 0 /* byte 0 from s2 */ |
| movi a10, 3 /* mask for the low two address bits */ |
| bne a8, a9, .Lretdiff |
| |
| or a11, a2, a3 |
| bnone a11, a10, .Laligned /* both pointers already word-aligned */ |
| |
| xor a11, a2, a3 /* compare low two bits of s1 and s2 */ |
| bany a11, a10, .Lunaligned /* if they have different alignment */ |
| |
| /* s1/s2 are not word-aligned, but are misaligned by the same amount. |
| Byte 0 has already been loaded and found equal; step over it and |
| compare single bytes until the pointers reach a word boundary. */ |
| addi a2, a2, 1 /* advance s1 */ |
| beqz a8, .Leq /* bytes equal, if zero, strings are equal */ |
| addi a3, a3, 1 /* advance s2 */ |
| bnone a2, a10, .Laligned /* if s1/s2 now aligned */ |
| l8ui a8, a2, 0 /* byte 1 from s1 */ |
| l8ui a9, a3, 0 /* byte 1 from s2 */ |
| addi a2, a2, 1 /* advance s1 */ |
| bne a8, a9, .Lretdiff /* if different, return difference */ |
| beqz a8, .Leq /* bytes equal, if zero, strings are equal */ |
| addi a3, a3, 1 /* advance s2 */ |
| bnone a2, a10, .Laligned /* if s1/s2 now aligned */ |
| l8ui a8, a2, 0 /* byte 2 from s1 */ |
| l8ui a9, a3, 0 /* byte 2 from s2 */ |
| addi a2, a2, 1 /* advance s1 */ |
| bne a8, a9, .Lretdiff /* if different, return difference */ |
| beqz a8, .Leq /* bytes equal, if zero, strings are equal */ |
| addi a3, a3, 1 /* advance s2 */ |
| j .Laligned /* at most three bytes are needed to reach alignment */ |
| |
| /* s1 and s2 have different alignment. |
| |
| If the zero-overhead loop option is available, use an (almost) |
| infinite zero-overhead loop with conditional exits so we only pay |
| for taken branches when exiting the loop. |
| |
| Note: It is important for this unaligned case to come before the |
| code for aligned strings, because otherwise some of the branches |
| above cannot reach and have to be transformed to branches around |
| jumps. The unaligned code is smaller and the branches can reach |
| over it. */ |
| |
| .align 4 |
| /* (2 mod 4) alignment for loop instruction */ |
| .Lunaligned: |
| #if XCHAL_HAVE_LOOPS |
| _movi.n a8, 0 /* set up for the maximum loop count */ |
| loop a8, .Lretdiff /* loop forever (almost anyway) */ |
| #endif |
| .Lnextbyte: |
| l8ui a8, a2, 0 |
| l8ui a9, a3, 0 |
| addi a2, a2, 1 |
| bne a8, a9, .Lretdiff |
| addi a3, a3, 1 |
| #if XCHAL_HAVE_LOOPS |
| beqz a8, .Lretdiff /* equal and zero: leave the loop, result is 0 */ |
| #else |
| bnez a8, .Lnextbyte |
| #endif |
| .Lretdiff: |
| sub a2, a8, a9 /* difference of the two zero-extended bytes */ |
| retw |
| |
| /* s1 is word-aligned; s2 is word-aligned. |
| |
| If the zero-overhead loop option is available, use a zero-overhead |
| loop with conditional exits so we only pay for taken branches when |
| exiting the loop. */ |
| |
| /* New algorithm, relying on the fact that all normal ASCII is between |
| 32 and 127. |
| |
| Rather than check all bytes for zero: |
| Take one word (4 bytes). Call it w1. |
| Shift w1 left by one into w1'. |
| Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't. |
| Check that all 4 bit 6's (one for each byte) are one: |
| If they are, we are definitely not done. |
| If they are not, we are probably done, but need to check for zero. */ |
| |
| .align 4 |
| #if XCHAL_HAVE_LOOPS |
| .Laligned: |
| .begin no-transform |
| l32r a4, .Lmask0 /* mask for byte 0 */ |
| l32r a7, .Lmask4 |
| /* a4 doubles as the loop count. On little-endian it is only 0xff, |
| so the count can run out on long strings; when that happens |
| execution falls into .Laligned_done, which restarts the loop. */ |
| .Laligned_loop: |
| loop a4, .Laligned_done |
| |
| /* First unrolled loop body. */ |
| l32i a8, a2, 0 /* get word from s1 */ |
| l32i a9, a3, 0 /* get word from s2 */ |
| slli a5, a8, 1 |
| bne a8, a9, .Lwne2 |
| or a9, a8, a5 |
| bnall a9, a7, .Lprobeq |
| |
| /* Second unrolled loop body. */ |
| l32i a8, a2, 4 /* get word from s1+4 */ |
| l32i a9, a3, 4 /* get word from s2+4 */ |
| slli a5, a8, 1 |
| bne a8, a9, .Lwne2 |
| or a9, a8, a5 |
| bnall a9, a7, .Lprobeq2 |
| |
| addi a2, a2, 8 /* advance s1 pointer */ |
| addi a3, a3, 8 /* advance s2 pointer */ |
| .Laligned_done: |
| /* Only reached when the loop count is exhausted. a2/a3 already |
| point at the next unchecked word, so just start the loop again. |
| Falling through into .Lprobeq2 here would skip two words. |
| (j is the same size as the nop that used to be here, so the |
| alignment of the code below is unchanged.) */ |
| j .Laligned_loop |
| |
| .Lprobeq2: |
| /* Adjust pointers to account for the loop unrolling. */ |
| addi a2, a2, 4 |
| addi a3, a3, 4 |
| |
| #else /* !XCHAL_HAVE_LOOPS */ |
| |
| .Laligned: |
| movi a4, MASK0 /* mask for byte 0 */ |
| movi a7, MASK4 |
| j .Lfirstword |
| .Lnextword: |
| addi a2, a2, 4 /* advance s1 pointer */ |
| addi a3, a3, 4 /* advance s2 pointer */ |
| .Lfirstword: |
| l32i a8, a2, 0 /* get word from s1 */ |
| l32i a9, a3, 0 /* get word from s2 */ |
| slli a5, a8, 1 |
| bne a8, a9, .Lwne2 |
| or a9, a8, a5 |
| ball a9, a7, .Lnextword /* every byte passes the bit-6 test: no zero byte */ |
| #endif /* !XCHAL_HAVE_LOOPS */ |
| |
| /* align (0 mod 4) */ |
| .Lprobeq: |
| /* Words are probably equal, but check for sure. |
| If not, loop over the rest of string using normal algorithm. |
| Here a8 holds the word (identical in both strings) that failed |
| the bit-6 test, and a2/a3 point at it. */ |
| |
| bnone a8, a4, .Leq /* if byte 0 is zero */ |
| l32r a5, .Lmask1 /* mask for byte 1 */ |
| l32r a6, .Lmask2 /* mask for byte 2 */ |
| bnone a8, a5, .Leq /* if byte 1 is zero */ |
| l32r a7, .Lmask3 /* mask for byte 3 */ |
| bnone a8, a6, .Leq /* if byte 2 is zero */ |
| bnone a8, a7, .Leq /* if byte 3 is zero */ |
| addi.n a2, a2, 4 /* advance s1 pointer */ |
| addi.n a3, a3, 4 /* advance s2 pointer */ |
| #if XCHAL_HAVE_LOOPS |
| |
| /* align (1 mod 4) */ |
| /* As above, a4 is the loop count and can run out on little-endian; |
| .Lcheck_loop_done restarts the loop in that case. */ |
| .Lcheck_loop: |
| loop a4, .Lcheck_loop_done |
| .end no-transform |
| |
| l32i a8, a2, 0 /* get word from s1 */ |
| l32i a9, a3, 0 /* get word from s2 */ |
| addi a2, a2, 4 /* advance s1 pointer */ |
| bne a8, a9, .Lwne |
| bnone a8, a4, .Leq /* if byte 0 is zero */ |
| bnone a8, a5, .Leq /* if byte 1 is zero */ |
| bnone a8, a6, .Leq /* if byte 2 is zero */ |
| bnone a8, a7, .Leq /* if byte 3 is zero */ |
| addi a3, a3, 4 /* advance s2 pointer */ |
| .Lcheck_loop_done: |
| /* Only reached when the loop count is exhausted: no zero byte and |
| no difference has been seen yet, so keep comparing instead of |
| falling into .Leq and reporting the strings as equal. */ |
| j .Lcheck_loop |
| |
| #else /* !XCHAL_HAVE_LOOPS */ |
| |
| j .Lfirstword2 |
| .Lnextword2: |
| addi a3, a3, 4 /* advance s2 pointer */ |
| .Lfirstword2: |
| l32i a8, a2, 0 /* get word from s1 */ |
| l32i a9, a3, 0 /* get word from s2 */ |
| addi a2, a2, 4 /* advance s1 pointer */ |
| bne a8, a9, .Lwne |
| bnone a8, a4, .Leq /* if byte 0 is zero */ |
| bnone a8, a5, .Leq /* if byte 1 is zero */ |
| bnone a8, a6, .Leq /* if byte 2 is zero */ |
| bany a8, a7, .Lnextword2 /* byte 3 is nonzero: continue with next word */ |
| #endif /* !XCHAL_HAVE_LOOPS */ |
| |
| /* Words are equal; some byte is zero. */ |
| .Leq: movi a2, 0 /* return equal */ |
| retw |
| |
| .Lwne2: /* Words are not equal. On big-endian processors, if none of the |
| bytes are zero, the return value can be determined by a simple |
| comparison. Only the fast aligned loop branches here, so a5 still |
| holds (a8 << 1) and a7 still holds MASK4. */ |
| #ifdef __XTENSA_EB__ |
| or a10, a8, a5 |
| bnall a10, a7, .Lsomezero |
| bgeu a8, a9, .Lposreturn /* unsigned word compare; a8 != a9 here */ |
| movi a2, -1 |
| retw |
| .Lposreturn: |
| movi a2, 1 |
| retw |
| .Lsomezero: /* There is probably some zero byte. */ |
| #endif /* __XTENSA_EB__ */ |
| .Lwne: /* Words are not equal. Walk the bytes in string order: stop at the |
| first byte that differs, or return equal if a zero byte comes |
| before any difference. a5/a6 are loaded here because callers |
| coming from the fast loop do not have the byte masks in them. */ |
| xor a2, a8, a9 /* get word with nonzero in byte that differs */ |
| bany a2, a4, .Ldiff0 /* if byte 0 differs */ |
| movi a5, MASK1 /* mask for byte 1 */ |
| bnone a8, a4, .Leq /* if byte 0 is zero */ |
| bany a2, a5, .Ldiff1 /* if byte 1 differs */ |
| movi a6, MASK2 /* mask for byte 2 */ |
| bnone a8, a5, .Leq /* if byte 1 is zero */ |
| bany a2, a6, .Ldiff2 /* if byte 2 differs */ |
| bnone a8, a6, .Leq /* if byte 2 is zero */ |
| /* Bytes 0-2 are equal and nonzero, so byte 3 differs: fall through |
| to .Ldiff3. */ |
| #ifdef __XTENSA_EB__ |
| .Ldiff3: |
| .Ldiff2: |
| .Ldiff1: |
| /* Byte 0 is equal (at least) and there is a difference before a zero |
| byte. Just subtract words to get the return value. |
| The high order equal bytes cancel, leaving room for the sign. */ |
| sub a2, a8, a9 |
| retw |
| |
| .Ldiff0: |
| /* Need to make room for the sign, so can't subtract whole words. */ |
| extui a10, a8, 24, 8 |
| extui a11, a9, 24, 8 |
| sub a2, a10, a11 |
| retw |
| |
| #else /* !__XTENSA_EB__ */ |
| /* Little-endian is a little more difficult because can't subtract |
| whole words. */ |
| .Ldiff3: |
| /* Bytes 0-2 are equal; byte 3 is different. |
| For little-endian need to have a sign bit for the difference. */ |
| extui a10, a8, 24, 8 |
| extui a11, a9, 24, 8 |
| sub a2, a10, a11 |
| retw |
| |
| .Ldiff0: |
| /* Byte 0 is different. */ |
| extui a10, a8, 0, 8 |
| extui a11, a9, 0, 8 |
| sub a2, a10, a11 |
| retw |
| |
| .Ldiff1: |
| /* Byte 0 is equal; byte 1 is different. */ |
| extui a10, a8, 8, 8 |
| extui a11, a9, 8, 8 |
| sub a2, a10, a11 |
| retw |
| |
| .Ldiff2: |
| /* Bytes 0-1 are equal; byte 2 is different. */ |
| extui a10, a8, 16, 8 |
| extui a11, a9, 16, 8 |
| sub a2, a10, a11 |
| retw |
| |
| #endif /* !__XTENSA_EB__ */ |
| |
| libc_hidden_def (strcmp) /* macro from the included headers; presumably emits the library-internal hidden alias - confirm */ |
| |
| #ifndef __UCLIBC_HAS_LOCALE__ /* only when locale support is not configured */ |
| strong_alias (strcmp, strcoll) /* presumably makes strcoll a second name for this strcmp - confirm against the macro */ |
| libc_hidden_def (strcoll) |
| #endif |