Blame - ap/build/uClibc/libc/string/ia64/memcpy.S - R306

blob: 6c48a72d99cf6efb35efd35692879ee5544eef5d [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame^]	1	/* Optimized version of the standard memcpy() function.
				2	This file is part of the GNU C Library.
				3	Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
				4	Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
				5	Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
				6
				7	The GNU C Library is free software; you can redistribute it and/or
				8	modify it under the terms of the GNU Lesser General Public
				9	License as published by the Free Software Foundation; either
				10	version 2.1 of the License, or (at your option) any later version.
				11
				12	The GNU C Library is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				15	Lesser General Public License for more details.
				16
				17	You should have received a copy of the GNU Lesser General Public
				18	License along with the GNU C Library; if not, write to the Free
				19	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
				20	02111-1307 USA. */
				21
				22	/* Return: dest
				23
				24	Inputs:
				25	in0: dest
				26	in1: src
				27	in2: byte count
				28
				29	An assembly implementation of the algorithm used by the generic C
				30	version from glibc. The case when source and dest are aligned is
				31	treated separately, for extra performance.
				32
				33	In this form, memcpy assumes little endian mode. For big endian mode,
				34	sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
				35	and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
				36	shrp instruction. */
				37
				38	#define USE_LFETCH /* emit lfetch prefetch instructions in the copy loops */
				39	#define USE_FLP /* aligned path uses ldf8/stf8/ldfp8 with FP registers (alternative: USE_INT) */
				40	#include "sysdep.h"
				41	#undef ret /* NOTE(review): presumably drops a `ret` macro inherited from sysdep.h -- confirm */
				42
				43	#define LFETCH_DIST 500 /* byte offset added to src/dest to form the prefetch pointers */
				44
				45	#define ALIGN_UNROLL_no 4 /* no. of elements (8-byte words) copied per pass of .l1 */
				46	#define ALIGN_UNROLL_sh 2 /* (shift amount): elemcnt >> this = number of .l1 passes */
				47
				48	#define MEMLAT 8 /* pipeline depth: stores are predicated MEMLAT stages after their loads */
				49	#define Nrot ((4*(MEMLAT+2) + 7) & ~7) /* 4*(MEMLAT+2) rounded up to a multiple of 8; passed to alloc */
				50
				51	#define OP_T_THRES 16 /* lengths <= this go straight to the byte-by-byte copy */
				52	#define OPSIZ 8 /* NOTE(review): only mentioned in a comment in the visible source */
				53
				54	#define loopcnt r14 /* iteration count, copied into ar.lc before each loop */
				55	#define elemcnt r15 /* number of 8-byte words left (len / 8) */
				56	#define saved_pr r16 /* caller's predicate registers */
				57	#define saved_lc r17 /* caller's ar.lc */
				58	#define adest r18 /* second store pointer in .l1 (dest + 16) */
				59	#define dest r19 /* running destination pointer */
				60	#define asrc r20 /* second source pointer: src + 16 (aligned path) or src & -8 (unaligned path) */
				61	#define src r21 /* running source pointer */
				62	#define len r22 /* bytes still to copy */
				63	#define tmp2 r23 /* scratch */
				64	#define tmp3 r24 /* scratch */
				65	#define tmp4 r25 /* scratch */
				66	#define ptable r26 /* address of .table */
				67	#define ploop56 r27 /* address of .loop56 */
				68	#define loopaddr r28 /* entry address of the selected LOOP() body */
				69	#define sh1 r29 /* src % 8, later 8 * (src % 8) */
				70	#define ptr1 r30 /* prefetch pointer: src + LFETCH_DIST */
				71	#define ptr2 r31 /* prefetch pointer: dest + LFETCH_DIST */
				72
				73	#define movi0 mov /* NOTE(review): plain alias for mov; name suggests an I-unit move -- confirm */
				74
				75	#define p_scr p6 /* scratch predicate */
				76	#define p_xtr p7 /* bit 3 of src is set: one extra word is copied before .l1 */
				77	#define p_nxtr p8 /* complement of p_xtr */
				78	#define p_few p9 /* len <= OP_T_THRES */
				79
				80	#if defined(USE_FLP) /* move 8-byte words through floating-point registers */
				81	#define load ldf8
				82	#define store stf8
				83	#define tempreg f6
				84	#define the_r fr /* fr/fq/fs/ft are the rotating FP names declared with .rotf */
				85	#define the_s fs
				86	#define the_t ft
				87	#define the_q fq
				88	#define the_w fw /* NOTE(review): the_w..the_z are not referenced in the visible source */
				89	#define the_x fx
				90	#define the_y fy
				91	#define the_z fz
				92	#elif defined(USE_INT) /* move 8-byte words through general registers */
				93	#define load ld8
				94	#define store st8
				95	#define tempreg tmp2
				96	#define the_r r /* r/s/q/t are the rotating general-register names declared with .rotr */
				97	#define the_s s
				98	#define the_t t
				99	#define the_q q
				100	#define the_w w /* NOTE(review): the_w..the_z are not referenced in the visible source */
				101	#define the_x x
				102	#define the_y y
				103	#define the_z z
				104	#endif
				105
				106	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
				107	/* Manually force proper loop-alignment. Note: be sure to
				108	double-check the code-layout after making any changes to
				109	this routine! */
				110	# define ALIGN(n) { nop 0 } /* n is ignored: a single nop bundle is emitted instead of .align */
				111	#else
				112	# define ALIGN(n) .align n /* let the assembler align to n bytes */
				113	#endif
				114
				115	#if defined(USE_LFETCH) /* LOOP(shift): unaligned-source copy loop, with prefetching */
				116	#define LOOP(shift) \
				117	ALIGN(32); \
				118	.loop##shift : /* one loop body per shift amount (in bits) */ \
				119	{ .mmb \
				120	(p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* next aligned source word into rotating r */ \
				121	(p[0]) lfetch.nt1 [ptr1], 16 ; \
				122	nop.b 0 ; \
				123	} { .mib \
				124	(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one stage earlier */ \
				125	(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* merge two adjacent source words */ \
				126	nop.b 0 ;; \
				127	} { .mmb \
				128	(p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* following aligned source word into rotating s */ \
				129	(p[0]) lfetch.nt1 [ptr2], 16 ; \
				130	nop.b 0 ; \
				131	} { .mib \
				132	(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the word merged one stage earlier */ \
				133	(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* merge the next pair of source words */ \
				134	br.ctop.sptk.many .loop##shift \
				135	;; } \
				136	{ .mib \
				137	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
				138	}
				139	#else /* same loop without the lfetch instructions */
				140	#define LOOP(shift) \
				141	ALIGN(32); \
				142	.loop##shift : /* one loop body per shift amount (in bits) */ \
				143	{ .mmb \
				144	(p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* next aligned source word into rotating r */ \
				145	nop.b 0 ; \
				146	} { .mib \
				147	(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one stage earlier */ \
				148	(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* merge two adjacent source words */ \
				149	nop.b 0 ;; \
				150	} { .mmb \
				151	(p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* following aligned source word into rotating s */ \
				152	nop.b 0 ; \
				153	} { .mib \
				154	(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the word merged one stage earlier */ \
				155	(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* merge the next pair of source words */ \
				156	br.ctop.sptk.many .loop##shift \
				157	;; } \
				158	{ .mib \
				159	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
				160	}
				161	#endif
				162
				163
				164	ENTRY(memcpy)
				165	{ .mmi
				166	.prologue
				167	alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot /* 3 inputs, Nrot - 3 locals, no outputs, Nrot rotating */
				168	.rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
				169	.rotp p[MEMLAT+2]
				170	.rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
				171	mov ret0 = in0 /* return value = dest */
				172	.save pr, saved_pr
				173	movi0 saved_pr = pr /* save the predicate registers */
				174	} { .mmi
				175	and tmp4 = 7, in0 /* tmp4 = dest & 7: check if destination is aligned */
				176	mov dest = in0 /* dest */
				177	mov src = in1 /* src */
				178	;; }
				179	{ .mii
				180	cmp.eq p_scr, p0 = in2, r0 /* if (len == 0) */
				181	.save ar.lc, saved_lc
				182	movi0 saved_lc = ar.lc /* save the loop counter */
				183	.body
				184	cmp.ge p_few, p0 = OP_T_THRES, in2 /* is len <= OP_T_THRES */
				185	} { .mbb
				186	mov len = in2 /* len */
				187	(p_scr) br.cond.dpnt.few .restore_and_exit /* Branch no. 1: return dest */
				188	(p_few) br.cond.dpnt.many .copy_bytes /* Branch no. 2: copy byte by byte */
				189	;; }
				190	{ .mmi
				191	#if defined(USE_LFETCH)
				192	lfetch.nt1 [dest] /* prefetch the start of the destination */
				193	lfetch.nt1 [src] /* prefetch the start of the source */
				194	#endif
				195	shr.u elemcnt = len, 3 /* elemcnt = len / 8 */
				196	} { .mib
				197	cmp.eq p_scr, p0 = tmp4, r0 /* is destination aligned? */
				198	sub loopcnt = 7, tmp4 /* loopcnt = 7 - (dest & 7): loop count for .l0 */
				199	(p_scr) br.cond.dptk.many .dest_aligned
				200	;; }
				201	{ .mmi
				202	ld1 tmp2 = [src], 1 /* preload the first byte */
				203	sub len = len, loopcnt, 1 /* reduce len by loopcnt + 1, the bytes .l0 copies */
				204	movi0 ar.lc = loopcnt /* .l0 runs loopcnt + 1 times */
				205	} { .mib
				206	cmp.ne p_scr, p0 = 0, loopcnt /* avoid loading beyond end-point */
				207	;; }
				208
				209	.l0: /* ---------------------------- L0: Copy bytes until dest is on an 8-byte boundary */
				210	{ .mmi
				211	st1 [dest] = tmp2, 1 /* store the byte loaded previously */
				212	(p_scr) ld1 tmp2 = [src], 1 /* load the next byte while more remain */
				213	} { .mib
				214	cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */
				215	add loopcnt = -1, loopcnt
				216	br.cloop.dptk.few .l0 /* repeat while ar.lc != 0 */
				217	;; }
				218
				219	.dest_aligned: /* dest is 8-byte aligned here; pick a path from the source alignment */
				220	{ .mmi
				221	and tmp4 = 7, src /* ready for alignment check */
				222	shr.u elemcnt = len, 3 /* elemcnt = len / 8 */
				223	;; }
				224	{ .mib
				225	cmp.ne p_scr, p0 = tmp4, r0 /* p_scr = source is NOT 8-byte aligned */
				226	tbit.nz p_xtr, p_nxtr = src, 3 /* prepare a separate move if src */
				227	} { .mib /* is not 16B aligned */
				228	add ptr2 = LFETCH_DIST, dest /* prefetch address (destination side) */
				229	add ptr1 = LFETCH_DIST, src /* prefetch address (source side) */
				230	(p_scr) br.cond.dptk.many .src_not_aligned
				231	;; }
				232
				233	/* The optimal case, when dest, and src are aligned */
				234
				235	.both_aligned:
				236	{ .mmi
				237	.pred.rel "mutex",p_xtr,p_nxtr
				238	(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt /* Need N + 1 to qualify */
				239	(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt /* Need only N to qualify */
				240	movi0 pr.rot = 1 << 16 /* set rotating predicates */
				241	} { .mib
				242	(p_scr) br.cond.dpnt.many .copy_full_words /* too few words for the unrolled loop */
				243	;; }
				244
				245	{ .mmi
				246	(p_xtr) load tempreg = [src], 8 /* src not 16B aligned: fetch one word up front */
				247	(p_xtr) add elemcnt = -1, elemcnt /* that word no longer counts for the loop */
				248	movi0 ar.ec = MEMLAT + 1 /* set the epilog counter */
				249	;; }
				250	{ .mmi
				251	(p_xtr) add len = -8, len /* account for the extra word */
				252	add asrc = 16, src /* one bank apart (for USE_INT) */
				253	shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh /* cater for unrolling */
				254	;;}
				255	{ .mmi
				256	add loopcnt = -1, loopcnt /* ar.lc holds passes - 1 */
				257	(p_xtr) store [dest] = tempreg, 8 /* copy the "extra" word */
				258	nop.i 0
				259	;; }
				260	{ .mib
				261	add adest = 16, dest /* second store pointer, 16 bytes ahead of dest */
				262	movi0 ar.lc = loopcnt /* set the loop counter */
				263	;; }
				264
				265	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
				266	{ nop 0 }
				267	#else
				268	.align 32
				269	#endif
				270	#if defined(USE_FLP)
				271	.l1: /* ------------------------------- L1: Everything a multiple of 8 */
				272	{ .mmi
				273	#if defined(USE_LFETCH)
				274	(p[0]) lfetch.nt1 [ptr2],32
				275	#endif
				276	(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 /* words 0 and 1 of this 32-byte group */
				277	(p[0]) add len = -32, len /* each pass consumes 32 bytes */
				278	} {.mmb
				279	(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 /* word 0 */
				280	(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8 /* word 2 (adest = dest + 16) */
				281	;; }
				282	{ .mmi
				283	#if defined(USE_LFETCH)
				284	(p[0]) lfetch.nt1 [ptr1],32
				285	#endif
				286	(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 /* words 2 and 3 of this 32-byte group */
				287	} {.mmb
				288	(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24 /* word 1; +24 moves dest to the next group */
				289	(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 /* word 3; +24 moves adest to the next group */
				290	br.ctop.dptk.many .l1
				291	;; }
				292	#elif defined(USE_INT)
				293	.l1: /* ------------------------------- L1: Everything a multiple of 8 */
				294	{ .mmi
				295	(p[0]) load the_r[0] = [src], 8 /* word 0 */
				296	(p[0]) load the_q[0] = [asrc], 8 /* word 2 (asrc = src + 16) */
				297	(p[0]) add len = -32, len /* each pass consumes 32 bytes */
				298	} {.mmb
				299	(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
				300	(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
				301	;; }
				302	{ .mmi
				303	(p[0]) load the_s[0] = [src], 24 /* word 1; +24 moves src to the next group */
				304	(p[0]) load the_t[0] = [asrc], 24 /* word 3; +24 moves asrc to the next group */
				305	} {.mmb
				306	(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
				307	(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
				308	#if defined(USE_LFETCH)
				309	;; }
				310	{ .mmb
				311	(p[0]) lfetch.nt1 [ptr2],32
				312	(p[0]) lfetch.nt1 [ptr1],32
				313	#endif
				314	br.ctop.dptk.many .l1
				315	;; }
				316	#endif
				317
				318	.copy_full_words:
				319	{ .mib
				320	cmp.gt p_scr, p0 = 8, len /* fewer than 8 bytes left? */
				321	shr.u elemcnt = len, 3 /* elemcnt = len / 8 */
				322	(p_scr) br.cond.dpnt.many .copy_bytes
				323	;; }
				324	{ .mii
				325	load tempreg = [src], 8 /* preload the first word */
				326	add loopcnt = -1, elemcnt /* loopcnt = elemcnt - 1: loop count for .l2 */
				327	;; }
				328	{ .mii
				329	cmp.ne p_scr, p0 = 0, loopcnt /* more than one word to copy? */
				330	mov ar.lc = loopcnt /* .l2 runs loopcnt + 1 times */
				331	;; }
				332
				333	.l2: /* ------------------------------- L2: Max 4 words copied separately */
				334	{ .mmi
				335	store [dest] = tempreg, 8 /* store the word loaded previously */
				336	(p_scr) load tempreg = [src], 8 /* load the next word while more remain */
				337	add len = -8, len
				338	} { .mib
				339	cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */
				340	add loopcnt = -1, loopcnt
				341	br.cloop.dptk.few .l2
				342	;; }
				343
				344	.copy_bytes:
				345	{ .mib
				346	cmp.eq p_scr, p0 = len, r0 /* is len == 0 ? */
				347	add loopcnt = -1, len /* loopcnt = len - 1 (len itself is unchanged) */
				348	(p_scr) br.cond.spnt .restore_and_exit
				349	;; }
				350	{ .mii
				351	ld1 tmp2 = [src], 1 /* preload the first byte */
				352	movi0 ar.lc = loopcnt /* .l3 runs loopcnt + 1 = len times */
				353	cmp.ne p_scr, p0 = 0, loopcnt /* avoid load beyond end-point */
				354	;; }
				355
				356	.l3: /* ------------------------------- L3: Final byte move */
				357	{ .mmi
				358	st1 [dest] = tmp2, 1 /* store the byte loaded previously */
				359	(p_scr) ld1 tmp2 = [src], 1 /* load the next byte while more remain */
				360	} { .mib
				361	cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */
				362	add loopcnt = -1, loopcnt
				363	br.cloop.dptk.few .l3
				364	;; }
				365
				366	.restore_and_exit:
				367	{ .mmi
				368	movi0 pr = saved_pr, -1 /* restore the predicate registers */
				369	;; }
				370	{ .mib
				371	movi0 ar.lc = saved_lc /* restore the loop counter */
				372	br.ret.sptk.many b0 /* return; ret0 was set to dest at entry */
				373	;; }
				374
				375
				376	.src_not_aligned:
				377	{ .mmi
				378	cmp.gt p_scr, p0 = 16, len /* fewer than 16 bytes left? */
				379	and sh1 = 7, src /* sh1 = src % 8 */
				380	shr.u loopcnt = len, 4 /* element-cnt = len / 16 */
				381	} { .mib
				382	add tmp4 = @ltoff(.table), gp /* where &table is stored (loaded below) */
				383	add tmp3 = @ltoff(.loop56), gp /* where &loop56 is stored (loaded below) */
				384	(p_scr) br.cond.dpnt.many .copy_bytes /* do byte by byte if too few */
				385	;; }
				386	{ .mmi
				387	and asrc = -8, src /* asrc = src & -8 -- align src for loop */
				388	add loopcnt = -1, loopcnt /* loopcnt-- */
				389	shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */
				390	} { .mmi
				391	ld8 ptable = [tmp4] /* ptable = &table */
				392	ld8 ploop56 = [tmp3] /* ploop56 = &loop56 */
				393	and tmp2 = -16, len /* tmp2 = len & -16 */
				394	;; }
				395	{ .mmi
				396	add tmp3 = ptable, sh1 /* tmp3 = &table + sh1 */
				397	add src = src, tmp2 /* src += len & (-16) */
				398	movi0 ar.lc = loopcnt /* set LC */
				399	;; }
				400	{ .mmi
				401	ld8 tmp4 = [tmp3] /* tmp4 = loop offset */
				402	sub len = len, tmp2 /* len -= len & (-16) */
				403	movi0 ar.ec = MEMLAT + 2 /* one more pass needed */
				404	;; }
				405	{ .mmi
				406	ld8 s[1] = [asrc], 8 /* preload */
				407	sub loopaddr = ploop56,tmp4 /* loopaddr = &loop56 - loop offset */
				408	movi0 pr.rot = 1 << 16 /* set rotating predicates */
				409	;; }
				410	{ .mib
				411	nop.m 0
				412	movi0 b6 = loopaddr
				413	br b6 /* jump to the appropriate loop */
				414	;; }
				415
				416	LOOP(8) /* one loop body per source misalignment: LOOP(8*k) serves src % 8 == k */
				417	LOOP(16)
				418	LOOP(24)
				419	LOOP(32)
				420	LOOP(40)
				421	LOOP(48)
				422	LOOP(56)
				423	END(memcpy)
				424	libc_hidden_def (memcpy)
				425
				426	.rodata
				427	.align 8
				428	.table: /* indexed by sh1 = 8 * (src % 8); each entry is the distance from .loopN to .loop56 */
				429	data8 0 /* dummy entry: src % 8 == 0 never branches to .src_not_aligned */
				430	data8 .loop56 - .loop8 /* src % 8 == 1 */
				431	data8 .loop56 - .loop16 /* src % 8 == 2 */
				432	data8 .loop56 - .loop24 /* src % 8 == 3 */
				433	data8 .loop56 - .loop32 /* src % 8 == 4 */
				434	data8 .loop56 - .loop40 /* src % 8 == 5 */
				435	data8 .loop56 - .loop48 /* src % 8 == 6 */
				436	data8 .loop56 - .loop56 /* src % 8 == 7 */