/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#include <sysdep.h>

ENTRY_ALIGN (memcpy, 6)

	mov	dst, dstin
	cmp	count, #64
	b.ge	L(cpy_not_short)
	cmp	count, #15
	b.le	L(tail15tiny)

	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
L(tail63):
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
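	/* tmp1 is 0x10, 0x20, or 0x30 here, and src/dst have already been
	 * advanced by that amount, so the code below copies exactly tmp1
	 * bytes by storing backwards from the new pointers.  For example,
	 * tmp1 == 0x30 falls through all three ldp/stp pairs (offsets -48,
	 * -32, -16), while tmp1 == 0x10 enters at 2 and copies 16 bytes.  */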
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

L(tail15):
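	/* Copy the remaining 1-15 bytes with one final, possibly
	 * overlapping, 16-byte copy that ends exactly at the end of the
	 * buffer.  Rewriting bytes already copied above is harmless, and
	 * every path into this block has copied at least 16 bytes, so the
	 * backwards-biased load cannot underrun the source buffer.  */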
	ands	count, count, #15
	beq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	RET

L(tail15tiny):
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied.  */
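	/* Each tbz tests one bit of count and copies that power-of-two
	 * sized piece with post-indexed loads/stores, so the pieces land
	 * back to back.  For example, count == 13 (0b1101) copies
	 * 8 + 4 + 1 bytes.  */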
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(cpy_not_short):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
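	/* The first 16 bytes are copied from the unaligned src, then both
	 * pointers advance by tmp2 (1-15 bytes), so the next aligned copy
	 * overlaps up to 15 of the bytes just stored.  It cannot overrun
	 * because count is at least 64 on entry to L(cpy_not_short).  */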
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63)
2:
	subs	count, count, #128
	b.ge	L(cpy_body_large)
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
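	/* The loop is software pipelined: the stores for one iteration are
	 * interleaved with the loads for the next, giving each load a full
	 * iteration to complete before its data is stored.  The loop body
	 * is ten instructions (40 bytes), so it fits in a single 64-byte
	 * line after the .p2align below.  */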
	.p2align 6
L(cpy_body_large):
	/* There are at least 128 bytes to copy.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
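	/* After this prologue 64 bytes have been loaded and both pointers
	 * sit 16 bytes below the next block, so every access in the loop
	 * can use positive offsets #16..#64, with the writeback on the #64
	 * access advancing each pointer by 64 per iteration.  */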
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
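	/* The loop always loads one 64-byte block ahead, so on exit the
	 * last block is still held in A-D.  Store it, undo the pre-bias on
	 * both pointers, and branch to the tail if 1-63 bytes remain.  */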
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	L(tail63)
	RET
END (memcpy)
libc_hidden_builtin_def (memcpy)