[T106][ZXW-22]7520V3SCV2.01.01.02P42U09_VEC_V0.8_AP_VEC origin source commit

Change-Id: Ic6e05d89ecd62fc34f82b23dcf306c93764aec4b
diff --git a/ap/build/uClibc/libc/string/xtensa/strcmp.S b/ap/build/uClibc/libc/string/xtensa/strcmp.S
new file mode 100644
index 0000000..ac058a2
--- /dev/null
+++ b/ap/build/uClibc/libc/string/xtensa/strcmp.S
@@ -0,0 +1,314 @@
+/* Optimized strcmp for Xtensa.
+   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+#include <features.h>
+
+#ifdef __XTENSA_EB__
+#define	MASK0 0xff000000
+#define	MASK1 0x00ff0000
+#define	MASK2 0x0000ff00
+#define	MASK3 0x000000ff
+#else
+#define	MASK0 0x000000ff
+#define	MASK1 0x0000ff00
+#define	MASK2 0x00ff0000
+#define	MASK3 0xff000000
+#endif /* __XTENSA_EB__ */
+/* MASKn selects byte n of a loaded word; byte 0 is the first in memory.  */
+#define MASK4 0x40404040
+/* MASK4 has bit 6 of every byte set.  Literal copies for l32r follow.  */
+	.literal .Lmask0, MASK0
+	.literal .Lmask1, MASK1
+	.literal .Lmask2, MASK2
+	.literal .Lmask3, MASK3
+	.literal .Lmask4, MASK4
+
+	.text
+ENTRY (strcmp)
+	/* a2 = s1, a3 = s2; the result is returned in a2.  */
+
+	l8ui	a8, a2, 0	/* byte 0 from s1 */
+	l8ui	a9, a3, 0	/* byte 0 from s2 */
+	movi	a10, 3		/* mask */
+	bne	a8, a9, .Lretdiff	/* first bytes differ: return a8 - a9 */
+
+	or	a11, a2, a3	/* low bits set if either pointer is unaligned */
+	bnone	a11, a10, .Laligned	/* both word-aligned: use the word loop */
+
+	xor	a11, a2, a3	/* compare low two bits of s1 and s2 */
+	bany	a11, a10, .Lunaligned	/* if they have different alignment */
+
+	/* s1/s2 share the same misalignment: step byte by byte (byte 0 was
+	   compared above) until both are word-aligned.  */
+	addi	a2, a2, 1	/* advance s1 */
+	beqz	a8, .Leq	/* bytes equal, if zero, strings are equal */
+	addi	a3, a3, 1	/* advance s2 */
+	bnone	a2, a10, .Laligned /* if s1/s2 now aligned */
+	l8ui	a8, a2, 0	/* byte 1 from s1 */
+	l8ui	a9, a3, 0	/* byte 1 from s2 */
+	addi	a2, a2, 1	/* advance s1 */
+	bne	a8, a9, .Lretdiff /* if different, return difference */
+	beqz	a8, .Leq	/* bytes equal, if zero, strings are equal */
+	addi	a3, a3, 1	/* advance s2 */
+	bnone	a2, a10, .Laligned /* if s1/s2 now aligned */
+	l8ui	a8, a2, 0	/* byte 2 from s1 */
+	l8ui	a9, a3, 0	/* byte 2 from s2 */
+	addi	a2, a2, 1	/* advance s1 */
+	bne	a8, a9, .Lretdiff /* if different, return difference */
+	beqz	a8, .Leq	/* bytes equal, if zero, strings are equal */
+	addi	a3, a3, 1	/* advance s2 */
+	j	.Laligned	/* three bytes consumed: pointers are aligned now */
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+	.align	4
+	/* (2 mod 4) alignment for loop instruction */
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+	_movi.n	a8, 0		/* set up for the maximum loop count */
+	loop	a8, .Lretdiff	/* loop forever (almost anyway) */
+#endif
+.Lnextbyte:
+	l8ui	a8, a2, 0	/* next byte from s1 */
+	l8ui	a9, a3, 0	/* next byte from s2 */
+	addi	a2, a2, 1	/* advance s1 */
+	bne	a8, a9, .Lretdiff	/* bytes differ: return the difference */
+	addi	a3, a3, 1	/* advance s2 */
+#if XCHAL_HAVE_LOOPS
+	beqz	a8, .Lretdiff	/* equal and zero: end of both strings */
+#else
+	bnez	a8, .Lnextbyte	/* equal and nonzero: keep comparing */
+#endif
+.Lretdiff:
+	sub	a2, a8, a9	/* result = last s1 byte - last s2 byte */
+	retw
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use zero-overhead
+   loops with conditional exits so we only pay for taken branches when
+   exiting.  The loop count is taken from a4 (MASK0), which is only 0xff
+   on little-endian, so an expired loop is restarted with a jump.  */
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
+
+	.align	4
+#if XCHAL_HAVE_LOOPS
+.Laligned:
+	.begin	no-transform
+	l32r	a4, .Lmask0	/* mask for byte 0 */
+	l32r	a7, .Lmask4
+	/* a4 can be as small as 0xff, so .Laligned_done restarts the loop.  */
+.Laligned_loop:
+	loop	a4, .Laligned_done
+
+	/* First unrolled loop body.  */
+	l32i	a8, a2, 0	/* get word from s1 */
+	l32i	a9, a3, 0	/* get word from s2 */
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	bnall	a9, a7, .Lprobeq
+
+	/* Second unrolled loop body.  */
+	l32i	a8, a2, 4	/* get word from s1+4 */
+	l32i	a9, a3, 4	/* get word from s2+4 */
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	bnall	a9, a7, .Lprobeq2
+
+	addi	a2, a2, 8	/* advance s1 pointer */
+	addi	a3, a3, 8	/* advance s2 pointer */
+.Laligned_done:
+	j	.Laligned_loop	/* count expired: restart (3 bytes, like a nop) */
+
+.Lprobeq2:
+	/* Adjust pointers to account for the loop unrolling.  */
+	addi	a2, a2, 4
+	addi	a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+	movi	a4, MASK0	/* mask for byte 0 */
+	movi	a7, MASK4
+	j	.Lfirstword
+.Lnextword:
+	addi	a2, a2, 4	/* advance s1 pointer */
+	addi	a3, a3, 4	/* advance s2 pointer */
+.Lfirstword:
+	l32i	a8, a2, 0	/* get word from s1 */
+	l32i	a9, a3, 0	/* get word from s2 */
+	slli	a5, a8, 1
+	bne	a8, a9, .Lwne2
+	or	a9, a8, a5
+	ball	a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	/* align (0 mod 4) */
+.Lprobeq:
+	/* Words are probably equal, but check for sure.
+	   If not, loop over the rest of string using normal algorithm.  */
+
+	bnone	a8, a4, .Leq	/* if byte 0 is zero */
+	l32r	a5, .Lmask1	/* mask for byte 1 */
+	l32r	a6, .Lmask2	/* mask for byte 2 */
+	bnone	a8, a5, .Leq	/* if byte 1 is zero */
+	l32r	a7, .Lmask3	/* mask for byte 3 */
+	bnone	a8, a6, .Leq	/* if byte 2 is zero */
+	bnone	a8, a7, .Leq	/* if byte 3 is zero */
+	addi.n	a2, a2, 4	/* advance s1 pointer */
+	addi.n	a3, a3, 4	/* advance s2 pointer */
+#if XCHAL_HAVE_LOOPS
+.Lwordloop:
+	/* align (1 mod 4) */
+	loop	a4, .Lwordloop_done /* a4 may be only 0xff; restarted below */
+	.end	no-transform
+	l32i	a8, a2, 0	/* get word from s1 */
+	l32i	a9, a3, 0	/* get word from s2 */
+	addi	a2, a2, 4	/* advance s1 pointer */
+	bne	a8, a9, .Lwne
+	bnone	a8, a4, .Leq	/* if byte 0 is zero */
+	bnone	a8, a5, .Leq	/* if byte 1 is zero */
+	bnone	a8, a6, .Leq	/* if byte 2 is zero */
+	bnone	a8, a7, .Leq	/* if byte 3 is zero */
+	addi	a3, a3, 4	/* advance s2 pointer */
+.Lwordloop_done:
+	j	.Lwordloop	/* count expired: restart, do not fall into .Leq */
+#else /* !XCHAL_HAVE_LOOPS */
+
+	j	.Lfirstword2
+.Lnextword2:
+	addi	a3, a3, 4	/* advance s2 pointer */
+.Lfirstword2:
+	l32i	a8, a2, 0	/* get word from s1 */
+	l32i	a9, a3, 0	/* get word from s2 */
+	addi	a2, a2, 4	/* advance s1 pointer */
+	bne	a8, a9, .Lwne
+	bnone	a8, a4, .Leq	/* if byte 0 is zero */
+	bnone	a8, a5, .Leq	/* if byte 1 is zero */
+	bnone	a8, a6, .Leq	/* if byte 2 is zero */
+	bany	a8, a7, .Lnextword2	/* loop unless byte 3 is zero */
+#endif /* !XCHAL_HAVE_LOOPS */
+
+	/* Words are equal; some byte is zero.  */
+.Leq:	movi	a2, 0		/* return equal */
+	retw
+
+.Lwne2:	/* Words are not equal.  On big-endian processors, if none of the
+	   bytes are zero, the return value can be determined by a simple
+	   comparison.  */
+#ifdef __XTENSA_EB__
+	or	a10, a8, a5	/* a5 = a8 << 1, set by the word loop */
+	bnall	a10, a7, .Lsomezero	/* a7 = MASK4: bit-6 test on every byte */
+	bgeu	a8, a9, .Lposreturn	/* unsigned word compare gives the order */
+	movi	a2, -1
+	retw
+.Lposreturn:
+	movi	a2, 1
+	retw
+.Lsomezero:	/* There is probably some zero byte. */
+#endif /* __XTENSA_EB__ */
+.Lwne:	/* Words are not equal.  */
+	xor	a2, a8, a9	/* get word with nonzero in byte that differs */
+	bany	a2, a4, .Ldiff0	/* if byte 0 differs */
+	movi	a5, MASK1	/* mask for byte 1 */
+	bnone	a8, a4, .Leq	/* if byte 0 is zero */
+	bany	a2, a5, .Ldiff1	/* if byte 1 differs */
+	movi	a6, MASK2	/* mask for byte 2 */
+	bnone	a8, a5, .Leq	/* if byte 1 is zero */
+	bany	a2, a6, .Ldiff2	/* if byte 2 differs */
+	bnone	a8, a6, .Leq	/* if byte 2 is zero */
+#ifdef __XTENSA_EB__
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+	/* Byte 0 is equal (at least) and there is a difference before a zero
+	   byte.  Just subtract words to get the return value.
+	   The high order equal bytes cancel, leaving room for the sign.  */
+	sub	a2, a8, a9
+	retw
+
+.Ldiff0:
+	/* Need to make room for the sign, so can't subtract whole words.  */
+	extui	a10, a8, 24, 8
+	extui	a11, a9, 24, 8
+	sub	a2, a10, a11
+	retw
+
+#else /* !__XTENSA_EB__ */
+	/* Little-endian is a little more difficult because can't subtract
+	   whole words.  */
+.Ldiff3:
+	/* Bytes 0-2 are equal; byte 3 is different.
+	   For little-endian need to have a sign bit for the difference.  */
+	extui	a10, a8, 24, 8
+	extui	a11, a9, 24, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff0:
+	/* Byte 0 is different.  */
+	extui	a10, a8, 0, 8
+	extui	a11, a9, 0, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff1:
+	/* Byte 0 is equal; byte 1 is different.  */
+	extui	a10, a8, 8, 8
+	extui	a11, a9, 8, 8
+	sub	a2, a10, a11
+	retw
+
+.Ldiff2:
+	/* Bytes 0-1 are equal; byte 2 is different.  */
+	extui	a10, a8, 16, 8
+	extui	a11, a9, 16, 8
+	sub	a2, a10, a11
+	retw
+
+#endif /* !__XTENSA_EB__ */
+
+libc_hidden_def (strcmp)
+/* Without locale support, strcoll is provided via strong_alias to strcmp.  */
+#ifndef __UCLIBC_HAS_LOCALE__
+strong_alias (strcmp, strcoll)
+libc_hidden_def (strcoll)
+#endif /* !__UCLIBC_HAS_LOCALE__ */