Blame - ap/build/uClibc/libc/string/ia64/memmove.S - T106_DC

blob: beaada6fc73c11fe8ad2debae12ff5c1d3cd7666 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame^]	1	/* Optimized version of the standard memmove() function.
				2	This file is part of the GNU C Library.
				3	Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
				4	Contributed by Dan Pop <Dan.Pop@cern.ch>.
				5
				6	The GNU C Library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Lesser General Public
				8	License as published by the Free Software Foundation; either
				9	version 2.1 of the License, or (at your option) any later version.
				10
				11	The GNU C Library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Lesser General Public License for more details.
				15
				16	You should have received a copy of the GNU Lesser General Public
				17	License along with the GNU C Library; if not, write to the Free
				18	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
				19	02111-1307 USA. */
				20
				21	/* Return: dest
				22
				23	Inputs:
				24	in0: dest
				25	in1: src
				26	in2: byte count
				27
				28	The core of the function is the memcpy implementation used in memcpy.S.
				29	When bytes have to be copied backwards, only the easy case, when
				30	all arguments are multiples of 8, is optimised.
				31
				32	In this form, it assumes little endian mode. For big endian mode,
				33	sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
				34	or the UM.be bit should be cleared at the beginning and set at the end. */
				35
				36	#include "sysdep.h"
				37	#undef ret /* NOTE(review): presumably stops a `ret` macro from sysdep.h expanding inside `br.ret` below -- confirm against sysdep.h */
				38
				39	#define OP_T_THRES 16 /* in .next, len <= OP_T_THRES goes straight to the byte-by-byte .cpyfew loop */
				40	#define OPSIZ 8 /* word size in bytes; only named in comments in this file, the code uses literal 8 / -8 / 7 */
				41
				42	#define adest r15 /* second store pointer, one word ahead of dest (used by .l0) */
				43	#define saved_pr r17 /* caller's predicate registers, restored on exit */
				44	#define saved_lc r18 /* caller's ar.lc, restored on exit */
				45	#define dest r19 /* working copy of in0, advanced as bytes are stored */
				46	#define src r20 /* working copy of in1, advanced as bytes are loaded */
				47	#define len r21 /* working copy of in2; reduced to the tail byte count in .dest_aligned */
				48	#define asrc r22 /* src + 8 in .l0; src rounded down to 8 for the LOOP(shift) loops */
				49	#define tmp2 r23 /* scratch: -dest at entry, later len & -8 */
				50	#define tmp3 r24 /* scratch */
				51	#define tmp4 r25 /* scratch: holds (dest | src | len) & 7 on entry to .forward / .backward */
				52	#define ptable r26 /* address of .table */
				53	#define ploop56 r27 /* address of .loop56 */
				54	#define loopaddr r28 /* entry address of the selected LOOP(shift) body */
				55	#define sh1 r29 /* src % 8, then scaled by 8 to index .table */
				56	#define loopcnt r30 /* iteration count minus one, loaded into ar.lc */
				57	#define value r31 /* data in flight: byte / odd-word copies and the shrp result */
				58
				59	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO /* build-time switch; NOTE(review): presumably set for assemblers whose .align disturbs unwind info -- confirm */
				60	# define ALIGN(n) { nop 0 } /* emit an explicit nop bundle instead of aligning; n is ignored */
				61	#else
				62	# define ALIGN(n) .align n /* align what follows to n bytes */
				63	#endif
				64
				65	#define LOOP(shift) /* pipelined copy loop for a source misaligned by shift bits, shift = 8 * (src % 8) */ \
				66	ALIGN(32); \
				67	.loop##shift : /* entry label; its distance from .loop56 is recorded in .table */ \
				68	(p[0]) ld8 r[0] = [asrc], 8 ; /* w1: next aligned source word */ \
				69	(p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the word merged on the previous pass */ \
				70	(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* merge two adjacent source words into one dest word */ \
				71	nop.b 0 ; \
				72	nop.b 0 ; \
				73	br.ctop.sptk .loop##shift ; \
				74	br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
				75
				76	#define MEMLAT 21 /* number of loop stages between a pipelined load (p[0]) and its matching store (p[MEMLAT]) */
				77	#define Nrot (((2*MEMLAT+3) + 7) & ~7) /* r[MEMLAT+2] plus q[MEMLAT+1] registers, rounded up to a multiple of 8; frame size and rotating count passed to alloc */
				78
				79	ENTRY(memmove) /* memmove(dest = in0, src = in1, len = in2): copy len bytes, overlap-safe; returns dest */
				80	.prologue
				81	alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot /* 3 inputs, Nrot - 3 locals, no outputs, Nrot rotating */
				82	.rotr r[MEMLAT + 2], q[MEMLAT + 1] /* names for the rotating general registers of the pipelined loops */
				83	.rotp p[MEMLAT + 2] /* names for the rotating predicates that enable each loop stage */
				84	mov ret0 = in0 /* return value = dest */
				85	.save pr, saved_pr
				86	mov saved_pr = pr /* save the predicate registers */
				87	.save ar.lc, saved_lc
				88	mov saved_lc = ar.lc /* save the loop counter */
				89	.body
				90	or tmp3 = in0, in1 ;; /* tmp3 = dest | src */
				91	or tmp3 = tmp3, in2 /* tmp3 = dest | src | len */
				92	mov dest = in0 /* dest */
				93	mov src = in1 /* src */
				94	mov len = in2 /* len */
				95	sub tmp2 = r0, in0 /* tmp2 = -dest */
				96	cmp.eq p6, p0 = in2, r0 /* if (len == 0) */
				97	(p6) br.cond.spnt .restore_and_exit;;/* return dest; */
				98	and tmp4 = 7, tmp3 /* tmp4 = (dest | src | len) & 7 */
				99	cmp.le p6, p0 = dest, src /* if dest <= src it's always safe */
				100	(p6) br.cond.spnt .forward /* to copy forward */
				101	add tmp3 = src, len;;
				102	cmp.lt p6, p0 = dest, tmp3 /* if dest > src && dest < src + len */
				103	(p6) br.cond.spnt .backward /* we have to copy backward */
				104	/* No harmful overlap: fall through and copy forward. */
				105	.forward:
				106	shr.u loopcnt = len, 4 ;; /* loopcnt = len / 16 */
				107	cmp.ne p6, p0 = tmp4, r0 /* if (((dest | src | len) & 7) != 0) */
				108	(p6) br.cond.sptk .next /* goto next; */
				109
				110	/* The optimal case, when dest, src and len are all multiples of 8 */
				111
				112	and tmp3 = 0xf, len
				113	mov pr.rot = 1 << 16 /* set rotating predicates */
				114	mov ar.ec = MEMLAT + 1 ;; /* set the epilog counter */
				115	cmp.ne p6, p0 = tmp3, r0 /* do we have to copy an extra word? */
				116	adds loopcnt = -1, loopcnt;; /* --loopcnt */
				117	(p6) ld8 value = [src], 8;;
				118	(p6) st8 [dest] = value, 8 /* copy the "odd" word */
				119	mov ar.lc = loopcnt /* set the loop counter */
				120	cmp.eq p6, p0 = 8, len
				121	(p6) br.cond.spnt .restore_and_exit;;/* the one-word special case */
				122	adds adest = 8, dest /* set adest one word ahead of dest */
				123	adds asrc = 8, src ;; /* set asrc one word ahead of src */
				124	nop.b 0 /* get the "golden" alignment for */
				125	nop.b 0 /* the next loop */
				126	.l0: /* two words (16 bytes) per iteration through two load/store streams */
				127	(p[0]) ld8 r[0] = [src], 16
				128	(p[0]) ld8 q[0] = [asrc], 16
				129	(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
				130	(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
				131	br.ctop.dptk .l0 ;;
				132	/* Aligned fast path done: same exit sequence as .restore_and_exit. */
				133	mov pr = saved_pr, -1 /* restore the predicate registers */
				134	mov ar.lc = saved_lc /* restore the loop counter */
				135	br.ret.sptk.many b0
				136	.next: /* forward copy where dest, src or len is not a multiple of 8 */
				137	cmp.ge p6, p0 = OP_T_THRES, len /* is len <= OP_T_THRES */
				138	and loopcnt = 7, tmp2 /* loopcnt = -dest % 8 */
				139	(p6) br.cond.spnt .cpyfew /* copy byte by byte */
				140	;;
				141	cmp.eq p6, p0 = loopcnt, r0
				142	(p6) br.cond.sptk .dest_aligned
				143	sub len = len, loopcnt /* len -= -dest % 8 */
				144	adds loopcnt = -1, loopcnt /* --loopcnt */
				145	;;
				146	mov ar.lc = loopcnt
				147	.l1: /* copy -dest % 8 bytes */
				148	ld1 value = [src], 1 /* value = *src++ */
				149	;;
				150	st1 [dest] = value, 1 /* *dest++ = value */
				151	br.cloop.dptk .l1
				152	.dest_aligned: /* dest is now 8-byte aligned; src may still be misaligned */
				153	and sh1 = 7, src /* sh1 = src % 8 */
				154	and tmp2 = -8, len /* tmp2 = len & -OPSIZ */
				155	and asrc = -8, src /* asrc = src & -OPSIZ -- align src */
				156	shr.u loopcnt = len, 3 /* loopcnt = len / 8 */
				157	and len = 7, len;; /* len = len % 8 */
				158	adds loopcnt = -1, loopcnt /* --loopcnt */
				159	addl tmp4 = @ltoff(.table), gp
				160	addl tmp3 = @ltoff(.loop56), gp
				161	mov ar.ec = MEMLAT + 1 /* set EC */
				162	mov pr.rot = 1 << 16;; /* set rotating predicates */
				163	mov ar.lc = loopcnt /* set LC */
				164	cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */
				165	(p6) br.cond.sptk .src_aligned
				166	add src = src, tmp2 /* src += len & -OPSIZ */
				167	shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */
				168	ld8 ploop56 = [tmp3] /* ploop56 = &loop56 */
				169	ld8 ptable = [tmp4];; /* ptable = &table */
				170	add tmp3 = ptable, sh1;; /* tmp3 = &table + sh1 */
				171	mov ar.ec = MEMLAT + 1 + 1 /* one more pass needed */
				172	ld8 tmp4 = [tmp3];; /* tmp4 = loop offset */
				173	sub loopaddr = ploop56,tmp4 /* loopaddr = &loop56 - loop offset */
				174	ld8 r[1] = [asrc], 8;; /* w0 */
				175	mov b6 = loopaddr;;
				176	br b6 /* jump to the appropriate loop */
				177
				178	LOOP(8)
				179	LOOP(16)
				180	LOOP(24)
				181	LOOP(32)
				182	LOOP(40)
				183	LOOP(48)
				184	LOOP(56)
				185
				186	.src_aligned: /* src and dest both 8-byte aligned: plain word-at-a-time loop */
				187	.l3:
				188	(p[0]) ld8 r[0] = [src], 8
				189	(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
				190	br.ctop.dptk .l3
				191	.cpyfew: /* copy the remaining len bytes one at a time */
				192	cmp.eq p6, p0 = len, r0 /* is len == 0 ? */
				193	adds len = -1, len /* --len; */
				194	(p6) br.cond.spnt .restore_and_exit ;;
				195	mov ar.lc = len
				196	.l4:
				197	ld1 value = [src], 1
				198	;;
				199	st1 [dest] = value, 1
				200	br.cloop.dptk .l4 ;;
				201	.restore_and_exit:
				202	mov pr = saved_pr, -1 /* restore the predicate registers */
				203	mov ar.lc = saved_lc /* restore the loop counter */
				204	br.ret.sptk.many b0
				205
				206	/* In the case of a backward copy, optimise only the case when everything
				207	is a multiple of 8, otherwise copy byte by byte. The backward copy is
				208	used only when the blocks are overlapping and dest > src.
				209	*/
				210	.backward:
				211	shr.u loopcnt = len, 3 /* loopcnt = len / 8 */
				212	add src = src, len /* src points one byte past the end */
				213	add dest = dest, len ;; /* dest points one byte past the end */
				214	mov ar.ec = MEMLAT + 1 /* set the epilog counter */
				215	mov pr.rot = 1 << 16 /* set rotating predicates */
				216	adds loopcnt = -1, loopcnt /* --loopcnt */
				217	cmp.ne p6, p0 = tmp4, r0 /* if (((dest | src | len) & 7) != 0) */
				218	(p6) br.cond.sptk .bytecopy ;; /* copy byte by byte backward */
				219	adds src = -8, src /* src points to the last word */
				220	adds dest = -8, dest /* dest points to the last word */
				221	mov ar.lc = loopcnt;; /* set the loop counter */
				222	.l5: /* word-at-a-time copy from the highest address downwards */
				223	(p[0]) ld8 r[0] = [src], -8
				224	(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
				225	br.ctop.dptk .l5
				226	br.cond.sptk .restore_and_exit
				227	.bytecopy:
				228	adds src = -1, src /* src points to the last byte */
				229	adds dest = -1, dest /* dest points to the last byte */
				230	adds loopcnt = -1, len;; /* loopcnt = len - 1 */
				231	mov ar.lc = loopcnt;; /* set the loop counter */
				232	.l6: /* byte-at-a-time copy from the highest address downwards */
				233	(p[0]) ld1 r[0] = [src], -1
				234	(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
				235	br.ctop.dptk .l6
				236	br.cond.sptk .restore_and_exit
				237	END(memmove)
				238
				239	.rodata
				240	.align 8
				241	.table: /* distance of each LOOP(shift) entry back from .loop56; memmove indexes it with 8 * (src % 8) */
				242	data8 0 /* dummy entry */
				243	data8 .loop56 - .loop8 /* src % 8 == 1 */
				244	data8 .loop56 - .loop16 /* src % 8 == 2 */
				245	data8 .loop56 - .loop24 /* src % 8 == 3 */
				246	data8 .loop56 - .loop32 /* src % 8 == 4 */
				247	data8 .loop56 - .loop40 /* src % 8 == 5 */
				248	data8 .loop56 - .loop48 /* src % 8 == 6 */
				249	data8 .loop56 - .loop56 /* src % 8 == 7 */
				250
				251	libc_hidden_def (memmove) /* NOTE(review): macro defined outside this file -- presumably emits the hidden internal alias; confirm */