Blame - ap/libc/glibc/glibc-2.22/sysdeps/ia64/memmove.S - T106_DC

blob: 3927ceb011adcbad4627bd8bc05e02eb7e00137b [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame^]	1	/* Optimized version of the standard memmove() function.
				2	This file is part of the GNU C Library.
				3	Copyright (C) 2000-2015 Free Software Foundation, Inc.
				4	Contributed by Dan Pop <Dan.Pop@cern.ch>.
				5
				6	The GNU C Library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Lesser General Public
				8	License as published by the Free Software Foundation; either
				9	version 2.1 of the License, or (at your option) any later version.
				10
				11	The GNU C Library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Lesser General Public License for more details.
				15
				16	You should have received a copy of the GNU Lesser General Public
				17	License along with the GNU C Library; if not, see
				18	<http://www.gnu.org/licenses/>. */
				19
				20	/* Return: dest
				21
				22	Inputs:
				23	in0: dest
				24	in1: src
				25	in2: byte count
				26
				27	The core of the function is the memcpy implementation used in memcpy.S.
				28	When bytes have to be copied backwards, only the easy case, when
				29	all arguments are multiples of 8, is optimised.
				30
				31	In this form, it assumes little endian mode. For big endian mode,
				32	sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
				33	or the UM.be bit should be cleared at the beginning and set at the end. */
				34
				35	#include <sysdep.h>
				36	#undef ret /* presumably so that no `ret' macro can expand inside br.ret below -- TODO confirm against sysdep.h */
				37
				38	#define OP_T_THRES 16 /* on the unaligned path, len <= OP_T_THRES is copied byte by byte */
				39	#define OPSIZ 8 /* word size in bytes; only referred to by the "& -OPSIZ" comments below */
				40
				41	#define adest r15 /* second store pointer, kept one word ahead of dest */
				42	#define saved_pr r17 /* copy of the predicate registers taken on entry */
				43	#define saved_lc r18 /* copy of ar.lc taken on entry */
				44	#define dest r19 /* working copy of in0 */
				45	#define src r20 /* working copy of in1 */
				46	#define len r21 /* working copy of in2 */
				47	#define asrc r22 /* src + 8 in the aligned loop, src & -8 in the shifted loops */
				48	#define tmp2 r23 /* scratch */
				49	#define tmp3 r24 /* scratch */
				50	#define tmp4 r25 /* scratch */
				51	#define ptable r26 /* address of .table */
				52	#define ploop56 r27 /* address of .loop56 */
				53	#define loopaddr r28 /* address of the shifted loop selected for this call */
				54	#define sh1 r29 /* src % 8, later scaled to 8 * (src % 8) */
				55	#define loopcnt r30 /* value loaded into ar.lc */
				56	#define value r31 /* data being moved from src to dest */
				57
				58	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
				59	# define ALIGN(n) { nop 0 } /* emit a nop in place of the .align directive */
				60	#else
				61	# define ALIGN(n) .align n
				62	#endif
				63
				64	#define LOOP(shift) /* pipelined copy loop for a source misaligned by shift bits */ \
				65	ALIGN(32); \
				66	.loop##shift##: \
				67	(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
				68	(p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the word assembled on the previous pass */ \
				69	(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* value = low 64 bits of (r[MEMLAT]:r[MEMLAT+1]) >> shift */ \
				70	nop.b 0 ; \
				71	nop.b 0 ; \
				72	br.ctop.sptk .loop##shift ; \
				73	br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
				74
				75	#define MEMLAT 21 /* number of register rotations between a load and the store of the same word */
				76	#define Nrot (((2*MEMLAT+3) + 7) & ~7) /* size of r[] plus q[] (2*MEMLAT+3), rounded up to a multiple of 8 */
				77
				78	ENTRY(memmove) /* memmove (in0 = dest, in1 = src, in2 = byte count); returns dest */
				79	.prologue
				80	alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot - 3 locals, no outputs, Nrot rotating registers
				81	.rotr r[MEMLAT + 2], q[MEMLAT + 1] // rotating general register arrays r[] and q[]
				82	.rotp p[MEMLAT + 2] // rotating predicate array p[]
				83	mov ret0 = in0 // return value = dest
				84	.save pr, saved_pr
				85	mov saved_pr = pr // save the predicate registers
				86	.save ar.lc, saved_lc
				87	mov saved_lc = ar.lc // save the loop counter
				88	.body
				89	or tmp3 = in0, in1 ;; // tmp3 = dest | src
				90	or tmp3 = tmp3, in2 // tmp3 = dest | src | len
				91	mov dest = in0 // dest
				92	mov src = in1 // src
				93	mov len = in2 // len
				94	sub tmp2 = r0, in0 // tmp2 = -dest
				95	cmp.eq p6, p0 = in2, r0 // if (len == 0)
				96	(p6) br.cond.spnt .restore_and_exit;;// return dest;
				97	and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
				98	cmp.le p6, p0 = dest, src // if dest <= src it's always safe
				99	(p6) br.cond.spnt .forward // to copy forward
				100	add tmp3 = src, len;; // tmp3 = src + len
				101	cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
				102	(p6) br.cond.spnt .backward // we have to copy backward
				103	// NOTE(review): cmp.le and cmp.lt above compare the addresses as signed values -- TODO confirm this is intended
				104	.forward: // also reached by fall-through when dest >= src + len
				105	shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
				106	cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
				107	(p6) br.cond.sptk .next // goto next;
				108	
				109	// The optimal case, when dest, src and len are all multiples of 8
				110	
				111	and tmp3 = 0xf, len // tmp3 = len % 16
				112	mov pr.rot = 1 << 16 // set rotating predicates
				113	mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
				114	cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
				115	adds loopcnt = -1, loopcnt;; // --loopcnt
				116	(p6) ld8 value = [src], 8;; // load the "odd" word, src += 8
				117	(p6) st8 [dest] = value, 8 // copy the "odd" word
				118	mov ar.lc = loopcnt // set the loop counter
				119	cmp.eq p6, p0 = 8, len // if (len == 8) the odd word was everything
				120	(p6) br.cond.spnt .restore_and_exit;;// the one-word special case
				121	adds adest = 8, dest // set adest one word ahead of dest
				122	adds asrc = 8, src ;; // set asrc one word ahead of src
				123	nop.b 0 // get the "golden" alignment for
				124	nop.b 0 // the next loop
				125	.l0: // pipelined loop moving 16 bytes per iteration
				126	(p[0]) ld8 r[0] = [src], 16 // r[0] = *src, src += 16
				127	(p[0]) ld8 q[0] = [asrc], 16 // q[0] = *asrc, asrc += 16
				128	(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16 // store the word loaded MEMLAT rotations earlier
				129	(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16 // same for the word one ahead
				130	br.ctop.dptk .l0 ;; // rotate the registers and loop
				131	
				132	mov pr = saved_pr, -1 // restore the predicate registers
				133	mov ar.lc = saved_lc // restore the loop counter
				134	br.ret.sptk.many b0 // return; ret0 already holds dest
				135	.next: // forward copy where dest, src or len is not a multiple of 8
				136	cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
				137	and loopcnt = 7, tmp2 // loopcnt = -dest % 8
				138	(p6) br.cond.spnt .cpyfew // copy byte by byte
				139	;;
				140	cmp.eq p6, p0 = loopcnt, r0 // is dest already a multiple of 8?
				141	(p6) br.cond.sptk .dest_aligned // yes: skip the byte loop
				142	sub len = len, loopcnt // len -= -dest % 8
				143	adds loopcnt = -1, loopcnt // --loopcnt
				144	;;
				145	mov ar.lc = loopcnt // set the loop counter
				146	.l1: // copy -dest % 8 bytes
				147	ld1 value = [src], 1 // value = *src++
				148	;;
				149	st1 [dest] = value, 1 // *dest++ = value
				150	br.cloop.dptk .l1 // loop while ar.lc != 0
				151	.dest_aligned: // dest is a multiple of 8 from here on
				152	and sh1 = 7, src // sh1 = src % 8
				153	and tmp2 = -8, len // tmp2 = len & -OPSIZ
				154	and asrc = -8, src // asrc = src & -OPSIZ -- align src
				155	shr.u loopcnt = len, 3 // loopcnt = len / 8
				156	and len = 7, len;; // len = len % 8
				157	adds loopcnt = -1, loopcnt // --loopcnt
				158	addl tmp4 = @ltoff(.table), gp // tmp4 = address of the linkage table slot for .table
				159	addl tmp3 = @ltoff(.loop56), gp // tmp3 = address of the linkage table slot for .loop56
				160	mov ar.ec = MEMLAT + 1 // set EC
				161	mov pr.rot = 1 << 16;; // set rotating predicates
				162	mov ar.lc = loopcnt // set LC
				163	cmp.eq p6, p0 = sh1, r0 // is the src aligned?
				164	(p6) br.cond.sptk .src_aligned // yes: plain word copy
				165	add src = src, tmp2 // src += len & -OPSIZ
				166	shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
				167	ld8 ploop56 = [tmp3] // ploop56 = &loop56
				168	ld8 ptable = [tmp4];; // ptable = &table
				169	add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
				170	mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
				171	ld8 tmp4 = [tmp3];; // tmp4 = loop offset
				172	sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
				173	ld8 r[1] = [asrc], 8;; // w0
				174	mov b6 = loopaddr;; // b6 = address of the selected loop
				175	br b6 // jump to the appropriate loop
				176	
				177	LOOP(8) /* selected when src % 8 == 1 */
				178	LOOP(16) /* selected when src % 8 == 2 */
				179	LOOP(24) /* selected when src % 8 == 3 */
				180	LOOP(32) /* selected when src % 8 == 4 */
				181	LOOP(40) /* selected when src % 8 == 5 */
				182	LOOP(48) /* selected when src % 8 == 6 */
				183	LOOP(56) /* selected when src % 8 == 7 */
				184	
				185	.src_aligned: // src and dest are both multiples of 8 here
				186	.l3: // pipelined loop moving one word per iteration
				187	(p[0]) ld8 r[0] = [src], 8 // r[0] = *src, src += 8
				188	(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8 // store the word loaded MEMLAT rotations earlier
				189	br.ctop.dptk .l3 // rotate the registers and loop
				190	.cpyfew: // copy the remaining len bytes one at a time
				191	cmp.eq p6, p0 = len, r0 // is len == 0 ?
				192	adds len = -1, len // --len;
				193	(p6) br.cond.spnt .restore_and_exit ;;
				194	mov ar.lc = len // set the loop counter
				195	.l4:
				196	ld1 value = [src], 1 // value = *src++
				197	;;
				198	st1 [dest] = value, 1 // *dest++ = value
				199	br.cloop.dptk .l4 ;; // loop while ar.lc != 0
				200	.restore_and_exit:
				201	mov pr = saved_pr, -1 // restore the predicate registers
				202	mov ar.lc = saved_lc // restore the loop counter
				203	br.ret.sptk.many b0 // return; ret0 already holds dest
				204	
				205	// In the case of a backward copy, optimise only the case when everything
				206	// is a multiple of 8, otherwise copy byte by byte. The backward copy is
				207	// used only when the blocks are overlapping and dest > src.
				208
				209	.backward:
				210	shr.u loopcnt = len, 3 // loopcnt = len / 8
				211	add src = src, len // src points one byte past the end
				212	add dest = dest, len ;; // dest points one byte past the end
				213	mov ar.ec = MEMLAT + 1 // set the epilog counter
				214	mov pr.rot = 1 << 16 // set rotating predicates
				215	adds loopcnt = -1, loopcnt // --loopcnt
				216	cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
				217	(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
				218	adds src = -8, src // src points to the last word
				219	adds dest = -8, dest // dest points to the last word
				220	mov ar.lc = loopcnt;; // set the loop counter
				221	.l5: // pipelined loop moving one word per iteration, highest address first
				222	(p[0]) ld8 r[0] = [src], -8 // r[0] = *src, src -= 8
				223	(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8 // store the word loaded MEMLAT rotations earlier
				224	br.ctop.dptk .l5 // rotate the registers and loop
				225	br.cond.sptk .restore_and_exit
				226	.bytecopy: // backward copy where dest, src or len is not a multiple of 8
				227	adds src = -1, src // src points to the last byte
				228	adds dest = -1, dest // dest points to the last byte
				229	adds loopcnt = -1, len;; // loopcnt = len - 1
				230	mov ar.lc = loopcnt;; // set the loop counter
				231	.l6: // pipelined loop moving one byte per iteration, highest address first
				232	(p[0]) ld1 r[0] = [src], -1 // r[0] = *src, src -= 1
				233	(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1 // store the byte loaded MEMLAT rotations earlier
				234	br.ctop.dptk .l6 // rotate the registers and loop
				235	br.cond.sptk .restore_and_exit
				236	END(memmove)
				237
				238	.rodata
				239	.align 8
				240	.table: // distance of each shifted loop back from .loop56, indexed by src % 8
				241	data8 0 // dummy entry
				242	data8 .loop56 - .loop8 // src % 8 == 1
				243	data8 .loop56 - .loop16 // src % 8 == 2
				244	data8 .loop56 - .loop24 // src % 8 == 3
				245	data8 .loop56 - .loop32 // src % 8 == 4
				246	data8 .loop56 - .loop40 // src % 8 == 5
				247	data8 .loop56 - .loop48 // src % 8 == 6
				248	data8 .loop56 - .loop56 // src % 8 == 7
				249
				250	libc_hidden_builtin_def (memmove)