| /* Optimized memcpy for Xtensa. | 
 |    Copyright (C) 2001, 2007 Free Software Foundation, Inc. | 
 |    This file is part of the GNU C Library. | 
 |  | 
 |    The GNU C Library is free software; you can redistribute it and/or | 
 |    modify it under the terms of the GNU Lesser General Public | 
 |    License as published by the Free Software Foundation; either | 
 |    version 2.1 of the License, or (at your option) any later version. | 
 |  | 
 |    The GNU C Library is distributed in the hope that it will be useful, | 
 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
 |    Lesser General Public License for more details. | 
 |  | 
 |    You should have received a copy of the GNU Lesser General Public | 
 |    License along with the GNU C Library; if not, write to the Free | 
 |    Software Foundation, Inc., 51 Franklin Street - Fifth Floor, | 
 |    Boston, MA 02110-1301, USA.  */ | 
 |  | 
 | #include "../../sysdeps/linux/xtensa/sysdep.h" | 
 | #include <bits/xtensa-config.h> | 
 |  | 
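/* Concatenate two source words and extract the 32 bits selected by
   the SAR register (a funnel shift via the SRC instruction).  Which
   operand supplies the more-significant bytes depends on the byte
   order, so the operands are swapped for little-endian.  */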
 | 	.macro	src_b	r, w0, w1 | 
 | #ifdef __XTENSA_EB__ | 
 | 	src	\r, \w0, \w1 | 
 | #else | 
 | 	src	\r, \w1, \w0 | 
 | #endif | 
 | 	.endm | 
 |  | 
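/* Set SAR from the low two bits of an address: SSA8B (big-endian)
   sets SAR = 32 - 8 * (addr & 3), and SSA8L (little-endian) sets
   SAR = 8 * (addr & 3).  Together with src_b this extracts an
   aligned word from two adjacent words of an unaligned source.  */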
 | 	.macro	ssa8	r | 
 | #ifdef __XTENSA_EB__ | 
 | 	ssa8b	\r | 
 | #else | 
 | 	ssa8l	\r | 
 | #endif | 
 | 	.endm | 
 |  | 
 | /* If the Xtensa Unaligned Load Exception option is not used, this | 
 |    code can run a few cycles faster by relying on the low address bits | 
 |    being ignored.  However, if the code is then run with an Xtensa ISS | 
 |    client that checks for unaligned accesses, it will produce a lot of | 
 |    warning messages.  Set this flag to disable the use of unaligned | 
 |    accesses and keep the ISS happy.  */ | 
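
/* When checking is enabled, the unaligned-source path below masks off
   the low source address bits explicitly (a few extra instructions)
   instead of relying on the loads to ignore them.  */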
 |  | 
 | #define UNALIGNED_ADDRESSES_CHECKED 1 | 
 |  | 
 | /* Do not use .literal_position in the ENTRY macro.  */ | 
 | #undef LITERAL_POSITION | 
 | #define LITERAL_POSITION | 
 |  | 
 |  | 
 | /* void *memcpy (void *dst, const void *src, size_t len) | 
 |  | 
 |    The algorithm is as follows: | 
 |  | 
 |    If the destination is unaligned, align it by conditionally | 
 |    copying 1- and/or 2-byte pieces. | 
 |  | 
   If the source is word-aligned, copy 16 bytes per loop iteration, and
   then finish up with 8-, 4-, 2-, and 1-byte copies conditional on the
   low bits of the length.

   Else (if the source is unaligned), do the same, but use the SRC
   instruction to shift the source data into alignment.
 |  | 
   This code tries to use fall-through branches for the common case of
   an aligned source and destination and a length that is a multiple
   of 4 (or 8).  */
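
/* For reference, a rough C equivalent of this routine (a sketch only:
   the name is illustrative, the short-copy cutoffs are dropped, the
   16-byte unrolling is shown as a simple word loop, and the SRC-based
   unaligned-source path is replaced by byte copies):

	#include <stddef.h>
	#include <stdint.h>

	void *memcpy_sketch (void *dst, const void *src, size_t n)
	{
	  char *d = dst;
	  const char *s = src;

	  // Align dst to a word boundary with byte copies.
	  while (n > 0 && ((uintptr_t) d & 3) != 0)
	    {
	      *d++ = *s++;
	      n--;
	    }

	  // Aligned fast path: copy whole words.
	  if (((uintptr_t) s & 3) == 0)
	    for (; n >= 4; n -= 4, d += 4, s += 4)
	      *(uint32_t *) d = *(const uint32_t *) s;

	  // Tail, and the whole unaligned-source case: byte copies.
	  while (n > 0)
	    {
	      *d++ = *s++;
	      n--;
	    }
	  return dst;
	}  */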
 |  | 
 |  | 
 | /* Byte by byte copy.  */ | 
 |  | 
 | 	.text | 
 | 	.align	4 | 
 | 	.literal_position | 
 | __memcpy_aux: | 
 |  | 
 | 	/* Skip a byte to get 1 mod 4 alignment for LOOPNEZ | 
 | 	   (0 mod 4 alignment for LBEG).  */ | 
 | 	.byte	0 | 
 |  | 
 | .Lbytecopy: | 
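	/* With the loop option, LOOPNEZ executes the body a4 times with
	   zero branch overhead, skipping to 2f when a4 is zero; without
	   it, fall back to an explicit end-address compare.  */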
 | #if XCHAL_HAVE_LOOPS | 
 | 	loopnez	a4, 2f | 
 | #else | 
 | 	beqz	a4, 2f | 
 | 	add	a7, a3, a4	/* a7 = end address for source */ | 
 | #endif | 
 | 1:	l8ui	a6, a3, 0 | 
 | 	addi	a3, a3, 1 | 
 | 	s8i	a6, a5, 0 | 
 | 	addi	a5, a5, 1 | 
 | #if !XCHAL_HAVE_LOOPS | 
 | 	blt	a3, a7, 1b | 
 | #endif | 
 | 2:	retw | 
 |  | 
 |  | 
 | /* Destination is unaligned.  */ | 
 |  | 
 | 	.align	4 | 
 | .Ldst1mod2: /* dst is only byte aligned */ | 
 |  | 
 | 	/* Do short copies byte-by-byte.  */ | 
 | 	_bltui	a4, 7, .Lbytecopy | 
 |  | 
 | 	/* Copy 1 byte.  */ | 
 | 	l8ui	a6, a3, 0 | 
 | 	addi	a3, a3, 1 | 
 | 	addi	a4, a4, -1 | 
 | 	s8i	a6, a5, 0 | 
 | 	addi	a5, a5, 1 | 
 |  | 
 | 	/* Return to main algorithm if dst is now aligned.  */ | 
 | 	_bbci.l	a5, 1, .Ldstaligned | 
 |  | 
 | .Ldst2mod4: /* dst has 16-bit alignment */ | 
 |  | 
 | 	/* Do short copies byte-by-byte.  */ | 
 | 	_bltui	a4, 6, .Lbytecopy | 
 |  | 
 | 	/* Copy 2 bytes.  */ | 
 | 	l8ui	a6, a3, 0 | 
 | 	l8ui	a7, a3, 1 | 
 | 	addi	a3, a3, 2 | 
 | 	addi	a4, a4, -2 | 
 | 	s8i	a6, a5, 0 | 
 | 	s8i	a7, a5, 1 | 
 | 	addi	a5, a5, 2 | 
 |  | 
 | 	/* dst is now aligned; return to main algorithm.  */ | 
 | 	j	.Ldstaligned | 
 |  | 
 |  | 
 | ENTRY (memcpy) | 
 | 	/* a2 = dst, a3 = src, a4 = len */ | 
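	/* Windowed ABI: ENTRY (from sysdep.h) is assumed to emit an
	   ENTRY instruction, and every exit returns with RETW.  */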
 |  | 
 | 	mov	a5, a2		/* copy dst so that a2 is return value */ | 
 | 	_bbsi.l	a2, 0, .Ldst1mod2 | 
 | 	_bbsi.l	a2, 1, .Ldst2mod4 | 
 | .Ldstaligned: | 
 |  | 
	/* Compute the number of loop iterations (16 bytes are copied
	   per iteration).  */
 | 	srli	a7, a4, 4 | 
 |  | 
	/* Check if the source is word-aligned.  The mask in a8 is also
	   used in .Lsrcunaligned below.  */
 | 	movi	a8, 3 | 
 | 	_bany	a3, a8, .Lsrcunaligned | 
 |  | 
 | 	/* Destination and source are word-aligned, use word copy.  */ | 
 | #if XCHAL_HAVE_LOOPS | 
 | 	loopnez	a7, 2f | 
 | #else | 
 | 	beqz	a7, 2f | 
 | 	slli	a8, a7, 4 | 
 | 	add	a8, a8, a3	/* a8 = end of last 16B source chunk */ | 
 | #endif | 
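	/* Loads and stores are interleaved so that a store never
	   immediately follows the load that produced its data, hiding
	   the load-use latency.  */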
 | 1:	l32i	a6, a3, 0 | 
 | 	l32i	a7, a3, 4 | 
 | 	s32i	a6, a5, 0 | 
 | 	l32i	a6, a3, 8 | 
 | 	s32i	a7, a5, 4 | 
 | 	l32i	a7, a3, 12 | 
 | 	s32i	a6, a5, 8 | 
 | 	addi	a3, a3, 16 | 
 | 	s32i	a7, a5, 12 | 
 | 	addi	a5, a5, 16 | 
 | #if !XCHAL_HAVE_LOOPS | 
 | 	blt	a3, a8, 1b | 
 | #endif | 
 |  | 
	/* Copy any leftover pieces smaller than 16B, selected by the low
	   four bits of the length: bit 3 = 8 bytes, bit 2 = 4 bytes,
	   bit 1 = 2 bytes, bit 0 = 1 byte.  */
 | 2:	bbci.l	a4, 3, 3f | 
 |  | 
 | 	/* Copy 8 bytes.  */ | 
 | 	l32i	a6, a3, 0 | 
 | 	l32i	a7, a3, 4 | 
 | 	addi	a3, a3, 8 | 
 | 	s32i	a6, a5, 0 | 
 | 	s32i	a7, a5, 4 | 
 | 	addi	a5, a5, 8 | 
 |  | 
 | 3:	bbsi.l	a4, 2, 4f | 
 | 	bbsi.l	a4, 1, 5f | 
 | 	bbsi.l	a4, 0, 6f | 
 | 	retw | 
 |  | 
 | 	/* Copy 4 bytes.  */ | 
 | 4:	l32i	a6, a3, 0 | 
 | 	addi	a3, a3, 4 | 
 | 	s32i	a6, a5, 0 | 
 | 	addi	a5, a5, 4 | 
 | 	bbsi.l	a4, 1, 5f | 
 | 	bbsi.l	a4, 0, 6f | 
 | 	retw | 
 |  | 
 | 	/* Copy 2 bytes.  */ | 
 | 5:	l16ui	a6, a3, 0 | 
 | 	addi	a3, a3, 2 | 
 | 	s16i	a6, a5, 0 | 
 | 	addi	a5, a5, 2 | 
 | 	bbsi.l	a4, 0, 6f | 
 | 	retw | 
 |  | 
 | 	/* Copy 1 byte.  */ | 
 | 6:	l8ui	a6, a3, 0 | 
 | 	s8i	a6, a5, 0 | 
 |  | 
 | .Ldone: | 
 | 	retw | 
 |  | 
 |  | 
 | /* Destination is aligned; source is unaligned.  */ | 
 |  | 
 | 	.align	4 | 
 | .Lsrcunaligned: | 
 | 	/* Avoid loading anything for zero-length copies.  */ | 
 | 	_beqz	a4, .Ldone | 
 |  | 
 | 	/* Copy 16 bytes per iteration for word-aligned dst and | 
 | 	   unaligned src.  */ | 
 | 	ssa8	a3		/* set shift amount from byte offset */ | 
 | #if UNALIGNED_ADDRESSES_CHECKED | 
	and	a11, a3, a8	/* save unalignment offset for below (a8 is 3) */
 | 	sub	a3, a3, a11	/* align a3 */ | 
 | #endif | 
 | 	l32i	a6, a3, 0	/* load first word */ | 
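	/* a6 now holds one word of lookahead; each src_b below combines
	   two adjacent source words into one aligned destination word.  */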
 | #if XCHAL_HAVE_LOOPS | 
 | 	loopnez	a7, 2f | 
 | #else | 
 | 	beqz	a7, 2f | 
 | 	slli	a10, a7, 4 | 
 | 	add	a10, a10, a3	/* a10 = end of last 16B source chunk */ | 
 | #endif | 
 | 1:	l32i	a7, a3, 4 | 
 | 	l32i	a8, a3, 8 | 
 | 	src_b	a6, a6, a7 | 
 | 	s32i	a6, a5, 0 | 
 | 	l32i	a9, a3, 12 | 
 | 	src_b	a7, a7, a8 | 
 | 	s32i	a7, a5, 4 | 
 | 	l32i	a6, a3, 16 | 
 | 	src_b	a8, a8, a9 | 
 | 	s32i	a8, a5, 8 | 
 | 	addi	a3, a3, 16 | 
 | 	src_b	a9, a9, a6 | 
 | 	s32i	a9, a5, 12 | 
 | 	addi	a5, a5, 16 | 
 | #if !XCHAL_HAVE_LOOPS | 
 | 	blt	a3, a10, 1b | 
 | #endif | 
 |  | 
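	/* Whether or not the loop ran, a6 holds the aligned source word
	   at the current a3, ready for the leftover copies below.  */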
 | 2:	bbci.l	a4, 3, 3f | 
 |  | 
 | 	/* Copy 8 bytes.  */ | 
 | 	l32i	a7, a3, 4 | 
 | 	l32i	a8, a3, 8 | 
 | 	src_b	a6, a6, a7 | 
 | 	s32i	a6, a5, 0 | 
 | 	addi	a3, a3, 8 | 
 | 	src_b	a7, a7, a8 | 
 | 	s32i	a7, a5, 4 | 
 | 	addi	a5, a5, 8 | 
	mov	a6, a8		/* keep the lookahead word in a6 */
 |  | 
 | 3:	bbci.l	a4, 2, 4f | 
 |  | 
 | 	/* Copy 4 bytes.  */ | 
 | 	l32i	a7, a3, 4 | 
 | 	addi	a3, a3, 4 | 
 | 	src_b	a6, a6, a7 | 
 | 	s32i	a6, a5, 0 | 
 | 	addi	a5, a5, 4 | 
	mov	a6, a7		/* keep the lookahead word in a6 */
 | 4: | 
 | #if UNALIGNED_ADDRESSES_CHECKED | 
 | 	add	a3, a3, a11	/* readjust a3 with correct misalignment */ | 
 | #endif | 
 | 	bbsi.l	a4, 1, 5f | 
 | 	bbsi.l	a4, 0, 6f | 
 | 	retw | 
 |  | 
	/* Copy 2 bytes with byte loads, since the readjusted source may
	   be unaligned.  */
 | 5:	l8ui	a6, a3, 0 | 
 | 	l8ui	a7, a3, 1 | 
 | 	addi	a3, a3, 2 | 
 | 	s8i	a6, a5, 0 | 
 | 	s8i	a7, a5, 1 | 
 | 	addi	a5, a5, 2 | 
 | 	bbsi.l	a4, 0, 6f | 
 | 	retw | 
 |  | 
 | 	/* Copy 1 byte.  */ | 
 | 6:	l8ui	a6, a3, 0 | 
 | 	s8i	a6, a5, 0 | 
 | 	retw | 
 |  | 
 | libc_hidden_def (memcpy) |