/* Copyright (C) 2012-2015 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

ENTRY_ALIGN (memmove, 6)

	cmp	dstin, src
	b.lo	L(downwards)
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */
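	/* A hedged C-level model of the dispatch above (a sketch, not
	 * the authoritative contract):
	 *
	 *   if (dstin < src)
	 *       goto downwards;                     // forward copy is safe
	 *   if (dstin >= src + count)
	 *       return memcpy (dstin, src, count);  // buffers are disjoint
	 *   // else DST overlaps the tail of SRC: copy backwards, below
	 */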

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	L(mov_not_short_up)
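	/* In effect this is the classic overlap-safe backward copy, with
	 * both pointers biased one past the data -- a minimal C sketch
	 * (the real code moves wide chunks, not bytes):
	 *
	 *   unsigned char *d = dstin + count, *s = src + count;
	 *   while (count--)
	 *       *--d = *--s;
	 */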

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63up):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
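	/* A hedged C model of this exit block: bits 5:4 of count pick
	 * how many 16-byte chunks to move, and bits 3:0 drive the
	 * power-of-two steps of L(tail15up).  Roughly:
	 *
	 *   size_t t = count & 0x30;          // 0, 16, 32 or 48 bytes
	 *   dst -= t;  src -= t;              // then move those t bytes
	 *   if (count & 8)  copy8 (dst -= 8, src -= 8);
	 *   if (count & 4)  copy4 (dst -= 4, src -= 4);
	 *   if (count & 2)  copy2 (dst -= 2, src -= 2);
	 *   if (count & 1)  copy1 (dst - 1, src - 1);
	 *
	 * copyN are hypothetical stand-ins for the ldp/ldr/ldrh/ldrb and
	 * matching store pairs below, not real helpers.  */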
	ands	tmp1, count, #0x30
	b.eq	L(tail15up)
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
L(tail15up):
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	RET

L(mov_not_short_up):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
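	/* Moving backwards, the distance from SRC down to the previous
	 * 16-byte boundary is simply its low four bits -- in C terms,
	 * something like:
	 *
	 *   size_t head = (uintptr_t) src & 15;   // bytes to peel off
	 */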
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63up)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_up)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
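	/* count was biased down by 128 above, but subtracting a multiple
	 * of 64 leaves bits 5:0 unchanged, so the TST below still sees
	 * the residual byte count modulo 64 -- all the tail needs.  */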
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
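	/* The loop is software-pipelined: 64 bytes are loaded before
	 * entry, and each iteration stores the chunk loaded on the
	 * previous pass while fetching the next one, overlapping loads
	 * with stores.  A hedged C-level sketch, with load64/store64 as
	 * illustrative helpers for the four ldp/stp pairs:
	 *
	 *   c = load64 (src -= 64);          // prologue
	 *   do {
	 *       store64 (dst -= 64, c);      // store previous chunk
	 *       c = load64 (src -= 64);      // fetch next chunk
	 *   } while ((count -= 64) >= 0);
	 *   store64 (dst -= 64, c);          // epilogue
	 */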
	.p2align 6
L(mov_body_large_up):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

L(downwards):
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */
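	/* The idea behind the 16-byte margin: memcpy copies upwards with
	 * accesses up to 16 bytes wide and may revisit part of the
	 * buffer, so as long as DST trails SRC by at least 16 bytes no
	 * store can clobber a source byte that may still be read.  */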

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	L(mov_not_short_down)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63down):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15down)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
L(tail15down):
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(mov_not_short_down):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
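	/* The NEG/AND pair computes the distance from SRC up to the next
	 * 16-byte boundary -- in C terms, roughly:
	 *
	 *   size_t head = (0 - (uintptr_t) src) & 15;
	 */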
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63down)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_down)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
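	/* The ADDs below do not set the condition flags, so the result
	 * of the TST above survives to the conditional branch.  */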
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63down)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_down):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
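	/* Pre-biasing shifts both pointers so each can be advanced by a
	 * single pre-indexed writeback access ([reg, #64]!) per loop
	 * iteration, while the other three accesses use plain immediate
	 * offsets -- no separate pointer-update ADDs in the loop.  */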
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	L(tail63down)
	RET
END (memmove)

libc_hidden_builtin_def (memmove)