/* Optimized memcpy for Xtensa.
   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
   Boston, MA 02110-1301, USA.  */

#include "../../sysdeps/linux/xtensa/sysdep.h"
#include <bits/xtensa-config.h>

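/* SRC shifts the 64-bit concatenation of two registers right by SAR
   bits and keeps the low 32 bits.  src_b orders its two word operands
   so that the result contains the source bytes in memory order on
   both big- and little-endian configurations.  */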
        .macro  src_b r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

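/* Set the shift-amount register (SAR) from the two low bits of an
   address: ssa8l sets SAR = 8 * (addr & 3) for little-endian shifts,
   ssa8b sets SAR = 32 - 8 * (addr & 3) for big-endian shifts.  */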
        .macro  ssa8 r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

#define UNALIGNED_ADDRESSES_CHECKED 1

/* Do not use .literal_position in the ENTRY macro.  */
#undef LITERAL_POSITION
#define LITERAL_POSITION


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes per iteration with a loop,
   and then finish up with 8-, 4-, 2-, and 1-byte copies conditional
   on the length.

   Else (if the source is unaligned), do the same, but use SRC to
   align the source data.

   This code tries to use fall-through branches for the common case of
   an aligned source and destination and a length that is a multiple
   of 4 (or 8).  */

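/* For reference, a hedged C sketch of the same control flow.  It is
   illustrative only (memcpy_sketch is a hypothetical name): it ignores
   the zero-overhead loop, the windowed ABI, and the SSA8/SRC word
   shifting used below for an unaligned source, falling back to plain
   byte copies in that case.

   #include <stdint.h>
   #include <stddef.h>

   void *memcpy_sketch (void *dst, const void *src, size_t n)
   {
     unsigned char *d = dst;
     const unsigned char *s = src;

     while (n > 0 && ((uintptr_t) d & 3) != 0)   // align the destination
       { *d++ = *s++; n--; }

     if (((uintptr_t) s & 3) == 0)               // source word-aligned
       {
         const uint32_t *sw = (const uint32_t *) s;
         uint32_t *dw = (uint32_t *) d;
         for (; n >= 16; n -= 16)                // 16 bytes per iteration
           {
             dw[0] = sw[0]; dw[1] = sw[1];
             dw[2] = sw[2]; dw[3] = sw[3];
             sw += 4; dw += 4;
           }
         s = (const unsigned char *) sw;
         d = (unsigned char *) dw;
       }

     while (n-- > 0)                             // 8/4/2/1-byte tail
       *d++ = *s++;
     return dst;
   }  */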


/* Byte-by-byte copy.  */

        .text
        .align  4
        .literal_position
__memcpy_aux:

        /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
           (0 mod 4 alignment for LBEG).  */
        .byte   0

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, 2f
#else
        beqz    a4, 2f
        add     a7, a3, a4      /* a7 = end address for source */
#endif
1:      l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        blt     a3, a7, 1b
#endif
2:      retw


/* Destination is unaligned.  */

        .align  4
.Ldst1mod2:     /* dst is only byte aligned */

        /* Do short copies byte-by-byte.  */
        _bltui  a4, 7, .Lbytecopy

        /* Copy 1 byte.  */
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
        addi    a5, a5, 1

        /* Return to main algorithm if dst is now aligned.  */
        _bbci.l a5, 1, .Ldstaligned

.Ldst2mod4:     /* dst has 16-bit alignment */

        /* Do short copies byte-by-byte.  */
        _bltui  a4, 6, .Lbytecopy

        /* Copy 2 bytes.  */
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2

        /* dst is now aligned; return to main algorithm.  */
        j       .Ldstaligned


ENTRY (memcpy)
        /* a2 = dst, a3 = src, a4 = len */

        mov     a5, a2          /* copy dst so that a2 is return value */
        _bbsi.l a2, 0, .Ldst1mod2
        _bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:

        /* Get the number of loop iterations, with 16 bytes copied per
           iteration.  */
        srli    a7, a4, 4

        /* Check if the source is aligned.  */
        movi    a8, 3
        _bany   a3, a8, .Lsrcunaligned

        /* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a8, a7, 4
        add     a8, a8, a3      /* a8 = end of last 16B source chunk */
#endif
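        /* Loads and stores are interleaved so that each word is stored
           at least two instructions after it is loaded, hiding the
           l32i load-use latency.  */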
1:      l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a8, 1b
#endif

        /* Copy any leftover pieces smaller than 16B.  */
2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8

3:      bbsi.l  a4, 2, 4f
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 4 bytes.  */
4:      l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 2 bytes.  */
5:      l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0

.Ldone:
        retw


/* Destination is aligned; source is unaligned.  */

        .align  4
.Lsrcunaligned:
        /* Avoid loading anything for zero-length copies.  */
        _beqz   a4, .Ldone

        /* Copy 16 bytes per iteration for word-aligned dst and
           unaligned src.  */
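/* A rough C model of the shifted copy for the little-endian case
   (illustrative only; shifted_copy_le is a hypothetical name, and sar
   is 8, 16, or 24 here because a word-aligned source never reaches
   this path):

   #include <stdint.h>
   #include <stddef.h>

   static void
   shifted_copy_le (uint32_t *dst, uintptr_t src, size_t nwords)
   {
     unsigned sar = 8 * (src & 3);                  // ssa8l
     const uint32_t *wp = (const uint32_t *) (src & ~(uintptr_t) 3);
     uint32_t w0 = wp[0];                           // prime the pipeline
     while (nwords-- > 0)
       {
         uint32_t w1 = *++wp;
         *dst++ = (w0 >> sar) | (w1 << (32 - sar)); // src_b (SRC)
         w0 = w1;
       }
   }

   The hardware computes each combined shift in a single SRC
   instruction, using the SAR register set once by ssa8.  */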
        ssa8    a3              /* set shift amount from byte offset */
#if UNALIGNED_ADDRESSES_CHECKED
        and     a11, a3, a8     /* save unalignment offset for below */
        sub     a3, a3, a11     /* align a3 */
#endif
        l32i    a6, a3, 0       /* load first word */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a10, a7, 4
        add     a10, a10, a3    /* a10 = end of last 16B source chunk */
#endif
1:      l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a10, 1b
#endif

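        /* Throughout the tail code below, a6 holds the last word
           loaded from the source; each chunk combines it with newly
           loaded words via src_b, then moves the newest word back
           into a6 for the next chunk.  */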
2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8

3:      bbci.l  a4, 2, 4f

        /* Copy 4 bytes.  */
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
        add     a3, a3, a11     /* readjust a3 with correct misalignment */
#endif
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 2 bytes.  */
5:      l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw

libc_hidden_def (memcpy)