| /* | 
 |  * Copyright (C) 2013 ARM Ltd. | 
 |  * Copyright (C) 2013 Linaro. | 
 |  * | 
 |  * This code is based on glibc cortex strings work originally authored by Linaro | 
 |  * and re-licensed under GPLv2 for the Linux kernel. The original code can | 
 |  * be found @ | 
 |  * | 
 |  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | 
 |  * files/head:/src/aarch64/ | 
 |  * | 
 |  * This program is free software; you can redistribute it and/or modify | 
 |  * it under the terms of the GNU General Public License version 2 as | 
 |  * published by the Free Software Foundation. | 
 |  * | 
 |  * This program is distributed in the hope that it will be useful, | 
 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
 |  * GNU General Public License for more details. | 
 |  * | 
 |  * You should have received a copy of the GNU General Public License | 
 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
 |  */ | 
 |  | 
 | #include <linux/linkage.h> | 
 | #include <asm/assembler.h> | 
 | #include <asm/cache.h> | 
 |  | 
 | /* | 
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src, or the buffers do not overlap, tail-call memcpy;
 * otherwise copy in reverse order, starting from the end of the buffer.
 |  * | 
 |  * Parameters: | 
 |  *	x0 - dest | 
 |  *	x1 - src | 
 |  *	x2 - n | 
 |  * Returns: | 
 |  *	x0 - dest | 
 |  */ | 
 | dstin	.req	x0 | 
 | src	.req	x1 | 
 | count	.req	x2 | 
 | tmp1	.req	x3 | 
 | tmp1w	.req	w3 | 
 | tmp2	.req	x4 | 
 | tmp2w	.req	w4 | 
 | tmp3	.req	x5 | 
 | tmp3w	.req	w5 | 
 | dst	.req	x6 | 
 |  | 
 | A_l	.req	x7 | 
 | A_h	.req	x8 | 
 | B_l	.req	x9 | 
 | B_h	.req	x10 | 
 | C_l	.req	x11 | 
 | C_h	.req	x12 | 
 | D_l	.req	x13 | 
 | D_h	.req	x14 | 
 |  | 
 | 	.weak memmove | 
 | ENTRY(__memmove) | 
 | ENTRY(memmove) | 
 | 	cmp	dstin, src | 
 | 	b.lo	__memcpy | 
 | 	add	tmp1, src, count | 
 | 	cmp	dstin, tmp1 | 
 | 	b.hs	__memcpy		/* No overlap.  */ | 
 |  | 
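	/*
	* Overlapping copy with dst above src: point src and dst at the ends
	* of the buffers and copy backwards, so each source byte is read
	* before a store to dst can overwrite it.
	*/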
 | 	add	dst, dstin, count | 
 | 	add	src, src, count | 
 | 	cmp	count, #16 | 
	b.lo	.Ltail15	/* fewer than 16 bytes: accesses may be unaligned */
 |  | 
 | 	ands	tmp2, src, #15     /* Bytes to reach alignment.  */ | 
 | 	b.eq	.LSrcAligned | 
 | 	sub	count, count, tmp2 | 
 | 	/* | 
 | 	* process the aligned offset length to make the src aligned firstly. | 
 | 	* those extra instructions' cost is acceptable. It also make the | 
 | 	* coming accesses are based on aligned address. | 
 | 	*/ | 
 | 	tbz	tmp2, #0, 1f | 
 | 	ldrb	tmp1w, [src, #-1]! | 
 | 	strb	tmp1w, [dst, #-1]! | 
 | 1: | 
 | 	tbz	tmp2, #1, 2f | 
 | 	ldrh	tmp1w, [src, #-2]! | 
 | 	strh	tmp1w, [dst, #-2]! | 
 | 2: | 
 | 	tbz	tmp2, #2, 3f | 
 | 	ldr	tmp1w, [src, #-4]! | 
 | 	str	tmp1w, [dst, #-4]! | 
 | 3: | 
 | 	tbz	tmp2, #3, .LSrcAligned | 
 | 	ldr	tmp1, [src, #-8]! | 
 | 	str	tmp1, [dst, #-8]! | 
 |  | 
 | .LSrcAligned: | 
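	/* src is now 16-byte aligned; dst may still be unaligned. */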
 | 	cmp	count, #64 | 
 | 	b.ge	.Lcpy_over64 | 
 |  | 
 | 	/* | 
 | 	* Deal with small copies quickly by dropping straight into the | 
 | 	* exit block. | 
 | 	*/ | 
 | .Ltail63: | 
 | 	/* | 
 | 	* Copy up to 48 bytes of data. At this point we only need the | 
 | 	* bottom 6 bits of count to be accurate. | 
 | 	*/ | 
 | 	ands	tmp1, count, #0x30 | 
 | 	b.eq	.Ltail15 | 
 | 	cmp	tmp1w, #0x20 | 
 | 	b.eq	1f | 
 | 	b.lt	2f | 
 | 	ldp	A_l, A_h, [src, #-16]! | 
 | 	stp	A_l, A_h, [dst, #-16]! | 
 | 1: | 
 | 	ldp	A_l, A_h, [src, #-16]! | 
 | 	stp	A_l, A_h, [dst, #-16]! | 
 | 2: | 
 | 	ldp	A_l, A_h, [src, #-16]! | 
 | 	stp	A_l, A_h, [dst, #-16]! | 
 |  | 
 | .Ltail15: | 
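	/* Copy the remaining 0-15 bytes, selected by bits 3..0 of count. */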
 | 	tbz	count, #3, 1f | 
 | 	ldr	tmp1, [src, #-8]! | 
 | 	str	tmp1, [dst, #-8]! | 
 | 1: | 
 | 	tbz	count, #2, 2f | 
 | 	ldr	tmp1w, [src, #-4]! | 
 | 	str	tmp1w, [dst, #-4]! | 
 | 2: | 
 | 	tbz	count, #1, 3f | 
 | 	ldrh	tmp1w, [src, #-2]! | 
 | 	strh	tmp1w, [dst, #-2]! | 
 | 3: | 
 | 	tbz	count, #0, .Lexitfunc | 
 | 	ldrb	tmp1w, [src, #-1] | 
 | 	strb	tmp1w, [dst, #-1] | 
 |  | 
 | .Lexitfunc: | 
 | 	ret | 
 |  | 
 | .Lcpy_over64: | 
 | 	subs	count, count, #128 | 
 | 	b.ge	.Lcpy_body_large | 
 | 	/* | 
 | 	* Less than 128 bytes to copy, so handle 64 bytes here and then jump | 
 | 	* to the tail. | 
 | 	*/ | 
 | 	ldp	A_l, A_h, [src, #-16] | 
 | 	stp	A_l, A_h, [dst, #-16] | 
 | 	ldp	B_l, B_h, [src, #-32] | 
 | 	ldp	C_l, C_h, [src, #-48] | 
 | 	stp	B_l, B_h, [dst, #-32] | 
 | 	stp	C_l, C_h, [dst, #-48] | 
 | 	ldp	D_l, D_h, [src, #-64]! | 
 | 	stp	D_l, D_h, [dst, #-64]! | 
 |  | 
 | 	tst	count, #0x3f | 
 | 	b.ne	.Ltail63 | 
 | 	ret | 
 |  | 
 | 	/* | 
 | 	* Critical loop. Start at a new cache line boundary. Assuming | 
 | 	* 64 bytes per line this ensures the entire loop is in one line. | 
 | 	*/ | 
 | 	.p2align	L1_CACHE_SHIFT | 
 | .Lcpy_body_large: | 
	/* Pre-load 64 bytes of data. */
 | 	ldp	A_l, A_h, [src, #-16] | 
 | 	ldp	B_l, B_h, [src, #-32] | 
 | 	ldp	C_l, C_h, [src, #-48] | 
 | 	ldp	D_l, D_h, [src, #-64]! | 
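	/*
	* count was biased by 128 above: 64 bytes for the block now held in
	* registers (stored after the loop) and 64 so the loop's load never
	* runs past the remaining data.
	*/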
 | 1: | 
 | 	/* | 
 | 	* interlace the load of next 64 bytes data block with store of the last | 
 | 	* loaded 64 bytes data. | 
 | 	*/ | 
 | 	stp	A_l, A_h, [dst, #-16] | 
 | 	ldp	A_l, A_h, [src, #-16] | 
 | 	stp	B_l, B_h, [dst, #-32] | 
 | 	ldp	B_l, B_h, [src, #-32] | 
 | 	stp	C_l, C_h, [dst, #-48] | 
 | 	ldp	C_l, C_h, [src, #-48] | 
 | 	stp	D_l, D_h, [dst, #-64]! | 
 | 	ldp	D_l, D_h, [src, #-64]! | 
 | 	subs	count, count, #64 | 
 | 	b.ge	1b | 
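	/* Store the final 64 bytes loaded by the last loop iteration. */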
 | 	stp	A_l, A_h, [dst, #-16] | 
 | 	stp	B_l, B_h, [dst, #-32] | 
 | 	stp	C_l, C_h, [dst, #-48] | 
 | 	stp	D_l, D_h, [dst, #-64]! | 
 |  | 
 | 	tst	count, #0x3f | 
 | 	b.ne	.Ltail63 | 
 | 	ret | 
 | ENDPIPROC(memmove) | 
 | ENDPROC(__memmove) |