Blame - ap/build/uClibc/libc/string/arm/_memcpy.S - R306

blob: b26080d0281dc4de48b75661a146a32c84b6d6e7 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*-
				2	* Copyright (c) 1997 The NetBSD Foundation, Inc.
				3	* All rights reserved.
				4	*
				5	* This code is derived from software contributed to The NetBSD Foundation
				6	* by Neil A. Carson and Mark Brinicombe
				7	*
				8	* Redistribution and use in source and binary forms, with or without
				9	* modification, are permitted provided that the following conditions
				10	* are met:
				11	* 1. Redistributions of source code must retain the above copyright
				12	* notice, this list of conditions and the following disclaimer.
				13	* 2. Redistributions in binary form must reproduce the above copyright
				14	* notice, this list of conditions and the following disclaimer in the
				15	* documentation and/or other materials provided with the distribution.
				16	* 3. All advertising materials mentioning features or use of this software
				17	* must display the following acknowledgement:
				18	* This product includes software developed by the NetBSD
				19	* Foundation, Inc. and its contributors.
				20	* 4. Neither the name of The NetBSD Foundation nor the names of its
				21	* contributors may be used to endorse or promote products derived
				22	* from this software without specific prior written permission.
				23	*
				24	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
				25	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
				26	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
				27	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
				28	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				29	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				30	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				31	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				32	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				33	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				34	* POSSIBILITY OF SUCH DAMAGE.
				35	*
				36	* Adapted for uClibc from NetBSD _memcpy.S,v 1.6 2003/10/09
				37	* by Erik Andersen <andersen@codepoet.org>
				38	*/
				39
				40	#include <features.h>
				41	#include <endian.h>
				42	#include <bits/arm_asm.h>
				43
				44	#if !defined(THUMB1_ONLY)
				45	/*
				46	* This is one fun bit of code ...
				47	* Some easy listening music is suggested while trying to understand this
				48	* code e.g. Iron Maiden
				49	*
				50	* For anyone attempting to understand it :
				51	*
				52	* The core code is implemented here with simple stubs for memcpy()
				53	* memmove() and bcopy().
				54	*
				55	* All local labels are prefixed with Lmemcpy_
				56	* Following the prefix a label starting f is used in the forward copy code
				57	* while a label using b is used in the backwards copy code
				58	* The source and destination addresses determine whether a forward or
				59	* backward copy is performed.
				60	* Separate bits of code are used to deal with the following situations
				61	* for both the forward and backwards copy.
				62	* unaligned source address
				63	* unaligned destination address
				64	* Separate copy routines are used to produce an optimised result for each
				65	* of these cases.
				66	* The copy code will use LDM/STM instructions to copy up to 32 bytes at
				67	* a time where possible.
				68	*
				69	* Note: r12 (aka ip) can be trashed during the function along with
				70	* r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
				71	* Additional registers are preserved prior to use i.e. r4, r5 & lr
				72	* The return value in r0 must be the destination address.
				73	*
				74	* Apologies for the state of the comments ;-)
				75	*/
				76
				77	.text /* the routine is assembled into the code section */
				78	.global _memcpy
				79	.hidden _memcpy /* global for linking between objects, but with hidden ELF visibility */
				80	.type _memcpy,%function
				81	.align 4 /* NOTE(review): on ARM targets GNU as treats the operand as a power of two (16-byte boundary) - confirm against the assembler manual */
				82
				83	/* XXX: The Thumb-2 conditionals can be removed if/when we require an
				84	assembler that supports unified syntax. */
				85	.macro copy regs /* if GE: load the register list from [r1] and store it to [r0], post-incrementing both pointers; flags are left untouched */
				86	#if defined(__thumb2__)
				87	ittt ge /* IT block spans THREE instructions: the two below plus whatever follows the macro invocation; in this file every use is followed by a GE-conditional sub */
				88	ldmiage r1!, \regs
				89	stmiage r0!, \regs
				90	#else
				91	ldmgeia r1!, \regs /* divided (pre-unified) syntax: the condition code precedes the ia suffix */
				92	stmgeia r0!, \regs
				93	#endif
				94	.endm
				95
				96	.macro copydb regs /* backward twin of copy: if GE, load the register list from below r1 and store it below r0, pre-decrementing both pointers */
				97	#if defined(__thumb2__)
				98	ittt ge /* IT block spans THREE instructions: the two below plus the GE-conditional instruction that follows each invocation */
				99	ldmdbge r1!, \regs
				100	stmdbge r0!, \regs
				101	#else
				102	ldmgedb r1!, \regs /* divided (pre-unified) syntax: the condition code precedes the db suffix */
				103	stmgedb r0!, \regs
				104	#endif
				105	.endm
				106
				107	_memcpy: /* In: r0 = dest, r1 = src, r2 = len. Out: r0 = dest. Picks the copy direction from the pointer order. */
				108	/* Determine copy direction */
				109	cmp r1, r0
				110	bcc .Lmemcpy_backwards /* src below dest (unsigned compare): copy from the top down */
				111
				112	IT(t, eq) /* Quick abort for src=dst */
				113	#if defined(__USE_BX__)
				114	bxeq lr
				115	#else
				116	moveq pc, lr
				117	#endif
				118	stmdb sp!, {r0, lr} /* memcpy() returns dest addr; saving lr also frees it as a scratch register for the forward path */
				119	subs r2, r2, #4 /* r2 = len - 4; the count stays biased like this so the sign flag answers "is a whole word left?" */
				120	blt .Lmemcpy_fl4 /* less than 4 bytes */
				121	ands r12, r0, #3 /* r12 = dest misalignment (0..3), consumed by .Lmemcpy_fdestul */
				122	bne .Lmemcpy_fdestul /* oh unaligned destination addr */
				123	ands r12, r1, #3 /* r12 = src misalignment (0..3), consumed by .Lmemcpy_fsrcul */
				124	bne .Lmemcpy_fsrcul /* oh unaligned source addr */
				125
				126	.Lmemcpy_ft8: /* forward copy, both pointers word aligned; entered with r2 = bytes remaining - 4 */
				127	/* We have aligned source and destination */
				128	subs r2, r2, #8 /* r2 = remaining - 12 */
				129	blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
				130	subs r2, r2, #0x14 /* r2 = remaining - 32 */
				131	blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
				132	str r4, [sp, #-4]! /* borrow r4 */
				133
				134	/* blat 32 bytes at a time */
				135	/* XXX for really big copies perhaps we should use more registers */
				136	.Lmemcpy_floop32:
				137	ldmia r1!, {r3, r4, r12, lr} /* first 16 bytes of this pass */
				138	stmia r0!, {r3, r4, r12, lr}
				139	ldmia r1!, {r3, r4, r12, lr} /* second 16 bytes of this pass */
				140	stmia r0!, {r3, r4, r12, lr}
				141	subs r2, r2, #0x20
				142	bge .Lmemcpy_floop32 /* go round again while another full 32 bytes remain */
				143
				144	cmn r2, #0x10 /* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
				145	/* blat a remaining 16 bytes */
				146	copy "{r3, r4, r12, lr}"
				147	subge r2, r2, #0x10 /* also the third instruction of the Thumb-2 IT block opened inside copy */
				148	ldr r4, [sp], #4 /* restore r4 */
				149
				150	.Lmemcpy_fl32: /* fewer than 32 bytes left; entered with r2 = remaining - 32 */
				151	adds r2, r2, #0x14 /* r2 = remaining - 12; GE when a 12-byte chunk is still available */
				152
				153	/* blat 12 bytes at a time */
				154	.Lmemcpy_floop12:
				155	copy "{r3, r12, lr}"
				156	#if defined(__thumb2__)
				157	subsge r2, r2, #0x0c
				158	#else
				159	subges r2, r2, #0x0c
				160	#endif
				161	bge .Lmemcpy_floop12 /* flags come from the conditional subtract above, or from the adds when nothing was copied */
				162
				163	.Lmemcpy_fl12: /* fewer than 12 bytes left; entered with r2 = remaining - 12 */
				164	adds r2, r2, #8 /* r2 = remaining - 4 */
				165	blt .Lmemcpy_fl4
				166
				167	subs r2, r2, #4 /* r2 = remaining - 8: LT means exactly one whole word is left, GE means two */
				168	IT(tt, lt)
				169	ldrlt r3, [r1], #4
				170	strlt r3, [r0], #4
				171	copy "{r3, r12}"
				172	subge r2, r2, #4 /* after either path r2 is again (bytes still to copy) - 4, as .Lmemcpy_fl4 expects */
				173
				174	.Lmemcpy_fl4: /* forward tail; entered with r2 = remaining - 4 and {r0, lr} from entry still on the stack */
				175	/* less than 4 bytes to go */
				176	adds r2, r2, #4 /* undo the bias: r2 = bytes remaining (0..3) */
				177	#if defined(__thumb2__)
				178	it eq
				179	popeq {r0, pc} /* done */
				180	#elif defined(__ARM_ARCH_4T__)
				181	ldmeqia sp!, {r0, r3} /* done */
				182	bxeq r3
				183	#else
				184	ldmeqia sp!, {r0, pc} /* done */
				185	#endif
				186
				187	/* copy the crud byte at a time */
				188	cmp r2, #2 /* one compare drives all three copies: first byte always, second if r2 >= 2, third if r2 > 2 */
				189	ldrb r3, [r1], #1
				190	strb r3, [r0], #1
				191	#if defined(__thumb2__)
				192	itt ge
				193	ldrbge r3, [r1], #1
				194	strbge r3, [r0], #1
				195	itt gt
				196	ldrbgt r3, [r1], #1
				197	strbgt r3, [r0], #1
				198	#else
				199	ldrgeb r3, [r1], #1
				200	strgeb r3, [r0], #1
				201	ldrgtb r3, [r1], #1
				202	strgtb r3, [r0], #1
				203	#endif
				204	#if defined(__ARM_ARCH_4T__)
				205	ldmia sp!, {r0, r3} /* pop the saved dest and return address; returns through bx rather than loading pc (NOTE(review): presumably for ARM/Thumb interworking on v4T - confirm) */
				206	bx r3
				207	#else
				208	ldmia sp!, {r0, pc} /* restore the saved dest into r0 and return */
				209	#endif
				210
				211	/* erg - unaligned destination */
				212	.Lmemcpy_fdestul: /* entered with r12 = dest & 3 (non-zero) and r2 = len - 4 */
				213	rsb r12, r12, #4 /* r12 = 4 - (dest & 3) = bytes needed to reach a word boundary (1..3) */
				214	cmp r12, #2 /* flags select how many of the byte copies below execute */
				215
				216	/* align destination with byte copies */
				217	ldrb r3, [r1], #1
				218	strb r3, [r0], #1
				219	#if defined(__thumb2__)
				220	itt ge
				221	ldrbge r3, [r1], #1
				222	strbge r3, [r0], #1
				223	itt gt
				224	ldrbgt r3, [r1], #1
				225	strbgt r3, [r0], #1
				226	#else
				227	ldrgeb r3, [r1], #1
				228	strgeb r3, [r0], #1
				229	ldrgtb r3, [r1], #1
				230	strgtb r3, [r0], #1
				231	#endif
				232	subs r2, r2, r12 /* account for the alignment bytes just copied */
				233	blt .Lmemcpy_fl4 /* less than 4 bytes */
				234
				235	ands r12, r1, #3 /* r12 = src misalignment now that dest is aligned */
				236	beq .Lmemcpy_ft8 /* we have an aligned source */
				237
				238	/* erg - unaligned source */
				239	/* This is where it gets nasty ... */
				240	.Lmemcpy_fsrcul: /* dest aligned, src not; entered with r12 = src & 3 (1..3) and r2 = remaining - 4 */
				241	bic r1, r1, #3 /* round src down to its word boundary */
				242	ldr lr, [r1], #4 /* preload the word holding the first wanted bytes; r1 now points at the next aligned word */
				243	cmp r12, #2
				244	bgt .Lmemcpy_fsrcul3 /* src offset 3 */
				245	beq .Lmemcpy_fsrcul2 /* src offset 2 */
				246	cmp r2, #0x0c /* falling through: src offset 1; fewer than 16 bytes left? */
				247	blt .Lmemcpy_fsrcul1loop4
				248	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				249	stmdb sp!, {r4, r5}
				250
				251	.Lmemcpy_fsrcul1loop16: /* each output word = 3 unconsumed bytes of the previous source word merged with 1 byte of the next; shift directions swap with endianness */
				252	#if __BYTE_ORDER == __BIG_ENDIAN
				253	mov r3, lr, lsl #8
				254	ldmia r1!, {r4, r5, r12, lr}
				255	orr r3, r3, r4, lsr #24
				256	mov r4, r4, lsl #8
				257	orr r4, r4, r5, lsr #24
				258	mov r5, r5, lsl #8
				259	orr r5, r5, r12, lsr #24
				260	mov r12, r12, lsl #8
				261	orr r12, r12, lr, lsr #24
				262	#else
				263	mov r3, lr, lsr #8 /* drop the byte already consumed from the carried word */
				264	ldmia r1!, {r4, r5, r12, lr} /* lr ends up holding the last word loaded, carried into the next pass */
				265	orr r3, r3, r4, lsl #24
				266	mov r4, r4, lsr #8
				267	orr r4, r4, r5, lsl #24
				268	mov r5, r5, lsr #8
				269	orr r5, r5, r12, lsl #24
				270	mov r12, r12, lsr #8
				271	orr r12, r12, lr, lsl #24
				272	#endif
				273	stmia r0!, {r3-r5, r12} /* 16 aligned bytes written per pass */
				274	subs r2, r2, #0x10
				275	bge .Lmemcpy_fsrcul1loop16
				276	ldmia sp!, {r4, r5}
				277	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				278	blt .Lmemcpy_fsrcul1l4
				279
				280	.Lmemcpy_fsrcul1loop4: /* same merge, one word per pass */
				281	#if __BYTE_ORDER == __BIG_ENDIAN
				282	mov r12, lr, lsl #8
				283	ldr lr, [r1], #4
				284	orr r12, r12, lr, lsr #24
				285	#else
				286	mov r12, lr, lsr #8
				287	ldr lr, [r1], #4
				288	orr r12, r12, lr, lsl #24
				289	#endif
				290	str r12, [r0], #4
				291	subs r2, r2, #4
				292	bge .Lmemcpy_fsrcul1loop4
				293
				294	.Lmemcpy_fsrcul1l4:
				295	sub r1, r1, #3 /* r1 is past the last loaded word, of which 3 bytes are still unconsumed: step back to the first of them */
				296	b .Lmemcpy_fl4
				297
				298	.Lmemcpy_fsrcul2: /* src offset 2: lr holds the preloaded word, r1 points at the next aligned word, r2 = remaining - 4 */
				299	cmp r2, #0x0c /* fewer than 16 bytes left? */
				300	blt .Lmemcpy_fsrcul2loop4
				301	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				302	stmdb sp!, {r4, r5}
				303
				304	.Lmemcpy_fsrcul2loop16: /* each output word = 2 unconsumed bytes of the previous source word merged with 2 bytes of the next */
				305	#if __BYTE_ORDER == __BIG_ENDIAN
				306	mov r3, lr, lsl #16
				307	ldmia r1!, {r4, r5, r12, lr}
				308	orr r3, r3, r4, lsr #16
				309	mov r4, r4, lsl #16
				310	orr r4, r4, r5, lsr #16
				311	mov r5, r5, lsl #16
				312	orr r5, r5, r12, lsr #16
				313	mov r12, r12, lsl #16
				314	orr r12, r12, lr, lsr #16
				315	#else
				316	mov r3, lr, lsr #16
				317	ldmia r1!, {r4, r5, r12, lr}
				318	orr r3, r3, r4, lsl #16
				319	mov r4, r4, lsr #16
				320	orr r4, r4, r5, lsl #16
				321	mov r5, r5, lsr #16
				322	orr r5, r5, r12, lsl #16
				323	mov r12, r12, lsr #16
				324	orr r12, r12, lr, lsl #16
				325	#endif
				326	stmia r0!, {r3-r5, r12} /* 16 aligned bytes written per pass */
				327	subs r2, r2, #0x10
				328	bge .Lmemcpy_fsrcul2loop16
				329	ldmia sp!, {r4, r5}
				330	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				331	blt .Lmemcpy_fsrcul2l4
				332
				333	.Lmemcpy_fsrcul2loop4: /* same merge, one word per pass */
				334	#if __BYTE_ORDER == __BIG_ENDIAN
				335	mov r12, lr, lsl #16
				336	ldr lr, [r1], #4
				337	orr r12, r12, lr, lsr #16
				338	#else
				339	mov r12, lr, lsr #16
				340	ldr lr, [r1], #4
				341	orr r12, r12, lr, lsl #16
				342	#endif
				343	str r12, [r0], #4
				344	subs r2, r2, #4
				345	bge .Lmemcpy_fsrcul2loop4
				346
				347	.Lmemcpy_fsrcul2l4:
				348	sub r1, r1, #2 /* 2 bytes of the last loaded word are still unconsumed: step r1 back to the first of them */
				349	b .Lmemcpy_fl4
				350
				351	.Lmemcpy_fsrcul3: /* src offset 3: lr holds the preloaded word, r1 points at the next aligned word, r2 = remaining - 4 */
				352	cmp r2, #0x0c /* fewer than 16 bytes left? */
				353	blt .Lmemcpy_fsrcul3loop4
				354	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				355	stmdb sp!, {r4, r5}
				356
				357	.Lmemcpy_fsrcul3loop16: /* each output word = 1 unconsumed byte of the previous source word merged with 3 bytes of the next */
				358	#if __BYTE_ORDER == __BIG_ENDIAN
				359	mov r3, lr, lsl #24
				360	ldmia r1!, {r4, r5, r12, lr}
				361	orr r3, r3, r4, lsr #8
				362	mov r4, r4, lsl #24
				363	orr r4, r4, r5, lsr #8
				364	mov r5, r5, lsl #24
				365	orr r5, r5, r12, lsr #8
				366	mov r12, r12, lsl #24
				367	orr r12, r12, lr, lsr #8
				368	#else
				369	mov r3, lr, lsr #24
				370	ldmia r1!, {r4, r5, r12, lr}
				371	orr r3, r3, r4, lsl #8
				372	mov r4, r4, lsr #24
				373	orr r4, r4, r5, lsl #8
				374	mov r5, r5, lsr #24
				375	orr r5, r5, r12, lsl #8
				376	mov r12, r12, lsr #24
				377	orr r12, r12, lr, lsl #8
				378	#endif
				379	stmia r0!, {r3-r5, r12} /* 16 aligned bytes written per pass */
				380	subs r2, r2, #0x10
				381	bge .Lmemcpy_fsrcul3loop16
				382	ldmia sp!, {r4, r5}
				383	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				384	blt .Lmemcpy_fsrcul3l4
				385
				386	.Lmemcpy_fsrcul3loop4: /* same merge, one word per pass */
				387	#if __BYTE_ORDER == __BIG_ENDIAN
				388	mov r12, lr, lsl #24
				389	ldr lr, [r1], #4
				390	orr r12, r12, lr, lsr #8
				391	#else
				392	mov r12, lr, lsr #24
				393	ldr lr, [r1], #4
				394	orr r12, r12, lr, lsl #8
				395	#endif
				396	str r12, [r0], #4
				397	subs r2, r2, #4
				398	bge .Lmemcpy_fsrcul3loop4
				399
				400	.Lmemcpy_fsrcul3l4:
				401	sub r1, r1, #1 /* 1 byte of the last loaded word is still unconsumed: step r1 back onto it */
				402	b .Lmemcpy_fl4
				403
				404	.Lmemcpy_backwards: /* src below dest: copy from the end downwards; nothing is pushed here, the paths that use r4/r5/lr as scratch save them themselves */
				405	add r1, r1, r2 /* r1 = one past the last source byte */
				406	add r0, r0, r2 /* r0 = one past the last dest byte; it is walked back down to dest as the bytes are stored */
				407	subs r2, r2, #4 /* r2 = len - 4, same bias as the forward path */
				408	blt .Lmemcpy_bl4 /* less than 4 bytes */
				409	ands r12, r0, #3 /* r12 = misalignment of the dest end pointer, consumed by .Lmemcpy_bdestul */
				410	bne .Lmemcpy_bdestul /* oh unaligned destination addr */
				411	ands r12, r1, #3 /* r12 = misalignment of the src end pointer, consumed by .Lmemcpy_bsrcul */
				412	bne .Lmemcpy_bsrcul /* oh unaligned source addr */
				413
				414	.Lmemcpy_bt8: /* backward copy, both end pointers word aligned; entered with r2 = remaining - 4 */
				415	/* We have aligned source and destination */
				416	subs r2, r2, #8 /* r2 = remaining - 12 */
				417	blt .Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
				418	stmdb sp!, {r4, lr} /* both are used as data registers below; restored just before .Lmemcpy_bl12 */
				419	subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
				420	blt .Lmemcpy_bl32
				421
				422	/* blat 32 bytes at a time */
				423	/* XXX for really big copies perhaps we should use more registers */
				424	.Lmemcpy_bloop32:
				425	ldmdb r1!, {r3, r4, r12, lr} /* top 16 bytes of this pass */
				426	stmdb r0!, {r3, r4, r12, lr}
				427	ldmdb r1!, {r3, r4, r12, lr} /* next 16 bytes down */
				428	stmdb r0!, {r3, r4, r12, lr}
				429	subs r2, r2, #0x20
				430	bge .Lmemcpy_bloop32 /* go round again while another full 32 bytes remain */
				431
				432	.Lmemcpy_bl32: /* reached with r4/lr already pushed and r2 = remaining - 32 */
				433	cmn r2, #0x10 /* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
				434	/* blat a remaining 16 bytes */
				435	copydb "{r3, r4, r12, lr}"
				436	subge r2, r2, #0x10 /* also the third instruction of the Thumb-2 IT block opened inside copydb */
				437	adds r2, r2, #0x14 /* r2 = remaining - 12; GE when a 12-byte chunk is still available */
				438	/* blat a remaining 12 bytes */
				439	copydb "{r3, r12, lr}"
				440	subge r2, r2, #0x0c /* completes the IT block; r2 is again remaining - 12 for .Lmemcpy_bl12 */
				441	ldmia sp!, {r4, lr} /* restore the registers pushed at the top of this path */
				442
				443	.Lmemcpy_bl12: /* fewer than 12 bytes left; entered with r2 = remaining - 12 */
				444	adds r2, r2, #8 /* r2 = remaining - 4 */
				445	blt .Lmemcpy_bl4
				446	subs r2, r2, #4 /* r2 = remaining - 8: LT means exactly one whole word is left, GE means two */
				447	IT(tt, lt)
				448	ldrlt r3, [r1, #-4]!
				449	strlt r3, [r0, #-4]!
				450	copydb "{r3, r12}"
				451	subge r2, r2, #4 /* after either path r2 is again (bytes still to copy) - 4, as .Lmemcpy_bl4 expects */
				452
				453	.Lmemcpy_bl4: /* backward tail; entered with r2 = remaining - 4; returns straight through lr since this path keeps nothing on the stack */
				454	/* less than 4 bytes to go */
				455	adds r2, r2, #4 /* undo the bias: r2 = bytes remaining (0..3) */
				456	IT(t, eq)
				457	#if defined(__USE_BX__)
				458	bxeq lr
				459	#else
				460	moveq pc, lr /* done */
				461	#endif
				462	/* copy the crud byte at a time */
				463	cmp r2, #2 /* one compare drives all three copies: first byte always, second if r2 >= 2, third if r2 > 2 */
				464	ldrb r3, [r1, #-1]!
				465	strb r3, [r0, #-1]!
				466	#ifdef __thumb2__
				467	itt ge
				468	ldrbge r3, [r1, #-1]!
				469	strbge r3, [r0, #-1]!
				470	itt gt
				471	ldrbgt r3, [r1, #-1]!
				472	strbgt r3, [r0, #-1]!
				473	#else
				474	ldrgeb r3, [r1, #-1]!
				475	strgeb r3, [r0, #-1]!
				476	ldrgtb r3, [r1, #-1]!
				477	strgtb r3, [r0, #-1]!
				478	#endif
				479	#if defined(__USE_BX__)
				480	bx lr
				481	#else
				482	mov pc, lr
				483	#endif
				484	/* erg - unaligned destination */
				485	.Lmemcpy_bdestul: /* entered with r12 = (dest end pointer) & 3, which is exactly the number of bytes (1..3) to copy downwards to reach a word boundary */
				486	cmp r12, #2 /* flags select how many of the byte copies below execute */
				487
				488	/* align destination with byte copies */
				489	ldrb r3, [r1, #-1]!
				490	strb r3, [r0, #-1]!
				491	#ifdef __thumb2__
				492	itt ge
				493	ldrbge r3, [r1, #-1]!
				494	strbge r3, [r0, #-1]!
				495	itt gt
				496	ldrbgt r3, [r1, #-1]!
				497	strbgt r3, [r0, #-1]!
				498	#else
				499	ldrgeb r3, [r1, #-1]!
				500	strgeb r3, [r0, #-1]!
				501	ldrgtb r3, [r1, #-1]!
				502	strgtb r3, [r0, #-1]!
				503	#endif
				504	subs r2, r2, r12 /* account for the alignment bytes just copied */
				505	blt .Lmemcpy_bl4 /* less than 4 bytes to go */
				506	ands r12, r1, #3 /* r12 = src misalignment now that dest is aligned */
				507	beq .Lmemcpy_bt8 /* we have an aligned source */
				508
				509	/* erg - unaligned source */
				510	/* This is where it gets nasty ... */
				511	.Lmemcpy_bsrcul: /* dest end aligned, src end not; entered with r12 = (src end pointer) & 3 (1..3) and r2 = remaining - 4 */
				512	bic r1, r1, #3 /* round the src end pointer down to its word boundary */
				513	ldr r3, [r1, #0] /* preload the word holding the topmost wanted bytes; r1 stays on that word, later loads read below it */
				514	cmp r12, #2
				515	blt .Lmemcpy_bsrcul1 /* src offset 1 */
				516	beq .Lmemcpy_bsrcul2 /* src offset 2 */
				517	cmp r2, #0x0c /* falling through: src offset 3; fewer than 16 bytes left? */
				518	blt .Lmemcpy_bsrcul3loop4
				519	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				520	stmdb sp!, {r4, r5, lr} /* lr must be saved here: the backward entry did not push it and it is used as a data register below */
				521
				522	.Lmemcpy_bsrcul3loop16: /* each output word = 3 unconsumed bytes of the carried (higher) source word merged with 1 byte of the next lower one; shift directions swap with endianness */
				523	#if __BYTE_ORDER == __BIG_ENDIAN
				524	mov lr, r3, lsr #8
				525	ldmdb r1!, {r3-r5, r12}
				526	orr lr, lr, r12, lsl #24
				527	mov r12, r12, lsr #8
				528	orr r12, r12, r5, lsl #24
				529	mov r5, r5, lsr #8
				530	orr r5, r5, r4, lsl #24
				531	mov r4, r4, lsr #8
				532	orr r4, r4, r3, lsl #24
				533	#else
				534	mov lr, r3, lsl #8
				535	ldmdb r1!, {r3-r5, r12} /* r3 ends up holding the lowest word loaded, carried into the next pass */
				536	orr lr, lr, r12, lsr #24
				537	mov r12, r12, lsl #8
				538	orr r12, r12, r5, lsr #24
				539	mov r5, r5, lsl #8
				540	orr r5, r5, r4, lsr #24
				541	mov r4, r4, lsl #8
				542	orr r4, r4, r3, lsr #24
				543	#endif
				544	stmdb r0!, {r4, r5, r12, lr} /* 16 aligned bytes written per pass */
				545	subs r2, r2, #0x10
				546	bge .Lmemcpy_bsrcul3loop16
				547	ldmia sp!, {r4, r5, lr}
				548	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				549	blt .Lmemcpy_bsrcul3l4
				550
				551	.Lmemcpy_bsrcul3loop4: /* same merge, one word per pass */
				552	#if __BYTE_ORDER == __BIG_ENDIAN
				553	mov r12, r3, lsr #8
				554	ldr r3, [r1, #-4]!
				555	orr r12, r12, r3, lsl #24
				556	#else
				557	mov r12, r3, lsl #8
				558	ldr r3, [r1, #-4]!
				559	orr r12, r12, r3, lsr #24
				560	#endif
				561	str r12, [r0, #-4]!
				562	subs r2, r2, #4
				563	bge .Lmemcpy_bsrcul3loop4
				564
				565	.Lmemcpy_bsrcul3l4:
				566	add r1, r1, #3 /* r1 sits at the start of the last loaded word, whose lowest 3 bytes are still unconsumed: move the end pointer just above them */
				567	b .Lmemcpy_bl4
				568
				569	.Lmemcpy_bsrcul2: /* src offset 2: r3 holds the preloaded word, r1 points at that word, r2 = remaining - 4 */
				570	cmp r2, #0x0c /* fewer than 16 bytes left? */
				571	blt .Lmemcpy_bsrcul2loop4
				572	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				573	stmdb sp!, {r4, r5, lr} /* lr is used as a data register below */
				574
				575	.Lmemcpy_bsrcul2loop16: /* each output word = 2 unconsumed bytes of the carried (higher) source word merged with 2 bytes of the next lower one */
				576	#if __BYTE_ORDER == __BIG_ENDIAN
				577	mov lr, r3, lsr #16
				578	ldmdb r1!, {r3-r5, r12}
				579	orr lr, lr, r12, lsl #16
				580	mov r12, r12, lsr #16
				581	orr r12, r12, r5, lsl #16
				582	mov r5, r5, lsr #16
				583	orr r5, r5, r4, lsl #16
				584	mov r4, r4, lsr #16
				585	orr r4, r4, r3, lsl #16
				586	#else
				587	mov lr, r3, lsl #16
				588	ldmdb r1!, {r3-r5, r12}
				589	orr lr, lr, r12, lsr #16
				590	mov r12, r12, lsl #16
				591	orr r12, r12, r5, lsr #16
				592	mov r5, r5, lsl #16
				593	orr r5, r5, r4, lsr #16
				594	mov r4, r4, lsl #16
				595	orr r4, r4, r3, lsr #16
				596	#endif
				597	stmdb r0!, {r4, r5, r12, lr} /* 16 aligned bytes written per pass */
				598	subs r2, r2, #0x10
				599	bge .Lmemcpy_bsrcul2loop16
				600	ldmia sp!, {r4, r5, lr}
				601	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				602	blt .Lmemcpy_bsrcul2l4
				603
				604	.Lmemcpy_bsrcul2loop4: /* same merge, one word per pass */
				605	#if __BYTE_ORDER == __BIG_ENDIAN
				606	mov r12, r3, lsr #16
				607	ldr r3, [r1, #-4]!
				608	orr r12, r12, r3, lsl #16
				609	#else
				610	mov r12, r3, lsl #16
				611	ldr r3, [r1, #-4]!
				612	orr r12, r12, r3, lsr #16
				613	#endif
				614	str r12, [r0, #-4]!
				615	subs r2, r2, #4
				616	bge .Lmemcpy_bsrcul2loop4
				617
				618	.Lmemcpy_bsrcul2l4:
				619	add r1, r1, #2 /* the lowest 2 bytes of the last loaded word are still unconsumed: move the end pointer just above them */
				620	b .Lmemcpy_bl4
				621
				622	.Lmemcpy_bsrcul1: /* src offset 1: r3 holds the preloaded word, r1 points at that word, r2 = remaining - 4 */
				623	cmp r2, #0x0c /* fewer than 16 bytes left? */
				624	blt .Lmemcpy_bsrcul1loop4
				625	sub r2, r2, #0x0c /* r2 = remaining - 16 */
				626	stmdb sp!, {r4, r5, lr} /* lr is used as a data register below */
				627
				628	.Lmemcpy_bsrcul1loop32: /* despite the "32" in the label this moves 16 bytes per pass, like its offset-2 and offset-3 siblings; each output word = 1 unconsumed byte of the carried word merged with 3 bytes of the next lower one */
				629	#if __BYTE_ORDER == __BIG_ENDIAN
				630	mov lr, r3, lsr #24
				631	ldmdb r1!, {r3-r5, r12}
				632	orr lr, lr, r12, lsl #8
				633	mov r12, r12, lsr #24
				634	orr r12, r12, r5, lsl #8
				635	mov r5, r5, lsr #24
				636	orr r5, r5, r4, lsl #8
				637	mov r4, r4, lsr #24
				638	orr r4, r4, r3, lsl #8
				639	#else
				640	mov lr, r3, lsl #24
				641	ldmdb r1!, {r3-r5, r12}
				642	orr lr, lr, r12, lsr #8
				643	mov r12, r12, lsl #24
				644	orr r12, r12, r5, lsr #8
				645	mov r5, r5, lsl #24
				646	orr r5, r5, r4, lsr #8
				647	mov r4, r4, lsl #24
				648	orr r4, r4, r3, lsr #8
				649	#endif
				650	stmdb r0!, {r4, r5, r12, lr} /* 16 aligned bytes written per pass */
				651	subs r2, r2, #0x10
				652	bge .Lmemcpy_bsrcul1loop32
				653	ldmia sp!, {r4, r5, lr}
				654	adds r2, r2, #0x0c /* back to the remaining - 4 bias */
				655	blt .Lmemcpy_bsrcul1l4
				656
				657	.Lmemcpy_bsrcul1loop4: /* same merge, one word per pass */
				658	#if __BYTE_ORDER == __BIG_ENDIAN
				659	mov r12, r3, lsr #24
				660	ldr r3, [r1, #-4]!
				661	orr r12, r12, r3, lsl #8
				662	#else
				663	mov r12, r3, lsl #24
				664	ldr r3, [r1, #-4]!
				665	orr r12, r12, r3, lsr #8
				666	#endif
				667	str r12, [r0, #-4]!
				668	subs r2, r2, #4
				669	bge .Lmemcpy_bsrcul1loop4
				670
				671	.Lmemcpy_bsrcul1l4:
				672	add r1, r1, #1 /* the lowest byte of the last loaded word is still unconsumed: move the end pointer just above it */
				673	b .Lmemcpy_bl4
				674
				675	#else /* THUMB1_ONLY */
				676
				677	/* This is a fairly dumb implementation for when we can't use the 32-bit code
				678	above. It is selected when THUMB1_ONLY is defined and copies with word and byte loops only. */
				679	.text
				680	.global _memcpy
				681	.hidden _memcpy /* global for linking between objects, but with hidden ELF visibility */
				682	.type _memcpy,%function
				683	.align 4
				684	.thumb /* everything from here on is assembled as Thumb instructions */
				685	_memcpy: /* Thumb-1 variant. In: r0 = dest, r1 = src, r2 = len. Out: r0 = dest. Copies downwards only when the regions overlap with dest above src. */
				686	push {r0, r4} /* keep dest for the return value; r4 is scratch below */
				687	cmp r2, #0
				688	beq .Lmemcpy_exit /* nothing to copy */
				689	@ See if we have overlapping regions, and need to reverse the
				690	@ direction of the copy
				691	cmp r0, r1
				692	bls .Lmemcpy_forwards /* dest <= src (unsigned): a forward copy is safe */
				693	add r4, r1, r2 /* r4 = src + len */
				694	cmp r0, r4
				695	bcc .Lmemcpy_backwards /* src < dest < src + len: overlapping, copy from the top down */
				696	.Lmemcpy_forwards:
				697	/* Forwards. */
				698	mov r3, r0
				699	eor r3, r1 /* r3 = dest ^ src */
				700	mov r4, #3 /* r4 = low-two-bits mask, reused by the alignment loop below */
				701	tst r3, r4
				702	bne .Lmemcpy_funaligned /* low two bits differ: the two pointers can never be word aligned together */
				703	cmp r2, #8
				704	bcc .Lmemcpy_funaligned /* fewer than 8 bytes: just copy bytes */
				705	1: @ copy up to the first word boundary.
				706	tst r0, r4
				707	beq 1f
				708	ldrb r3, [r1]
				709	add r1, r1, #1
				710	strb r3, [r0]
				711	add r0, r0, #1
				712	sub r2, r2, #1
				713	b 1b
				714	1: @ Copy aligned words
				715	ldr r3, [r1]
				716	add r1, r1, #4
				717	str r3, [r0]
				718	add r0, r0, #4
				719	sub r2, r2, #4
				720	cmp r2, #4
				721	bcs 1b /* at least one whole word still to copy */
				722	cmp r2, #0
				723	beq .Lmemcpy_exit /* length was used up exactly by the word loop */
				724	.Lmemcpy_funaligned: /* byte-at-a-time forward copy; r2 is non-zero on every path that reaches here */
				725	1:
				726	ldrb r3, [r1]
				727	add r1, r1, #1
				728	strb r3, [r0]
				729	add r0, r0, #1
				730	sub r2, r2, #1 /* the bne below consumes the flags set by this subtract; there is no separate compare */
				731	bne 1b
				732	.Lmemcpy_exit:
				733	pop {r0, r4} /* r0 = original dest, r4 restored */
				734	bx lr
				735
				736	.Lmemcpy_backwards: /* Thumb-1 overlapping case: byte copy from the last byte down to the first; r2 is non-zero on entry */
				737	add r0, r0, r2 /* r0 = one past the last dest byte */
				738	add r1, r1, r2 /* r1 = one past the last source byte */
				739	1:
				740	sub r0, r0, #1
				741	sub r1, r1, #1
				742	ldrb r3, [r1]
				743	strb r3, [r0]
				744	sub r2, r2, #1 /* the bne below consumes the flags set by this subtract */
				745	bne 1b
				746	b .Lmemcpy_exit /* shared epilogue restores r0 = dest and r4 */
				747	#endif