/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2016 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    src
        in2:    byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
   shrp instruction.  */
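/* How the unaligned inner loops work (a sketch, derived from the code
   below): shrp dst = hi, lo, n concatenates hi:lo into a 128-bit value
   and extracts the 64 bits starting at bit n.  With n = 8 * (src % 8),
   lo = the earlier aligned 8-byte word and hi = the following one, the
   result is exactly the misaligned 8-byte window straddling the two
   (little endian).  Seven loop variants, one per possible shift, are
   generated by LOOP() and selected through the jump table at the end
   of this file.  */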

#define USE_LFETCH
#define USE_FLP
#include <sysdep.h>
#undef ret

#define LFETCH_DIST     500	// how far (in bytes) lfetch runs ahead

#define ALIGN_UNROLL_no   4 // no. of elements
#define ALIGN_UNROLL_sh	  2 // (shift amount)

#define MEMLAT	8	// pipeline stages between a load and its store
#define Nrot	((4*(MEMLAT+2) + 7) & ~7)
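/* With MEMLAT = 8: 4*(MEMLAT+2) = 40, and (40 + 7) & ~7 = 40, so 40
   rotating registers are reserved; the "& ~7" rounds up because alloc
   requires the size of the rotating region to be a multiple of 8.  */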

#define OP_T_THRES 	16	// at or below this length, copy byte by byte
#define OPSIZ 		8	// bytes per full word

#define loopcnt		r14
#define elemcnt		r15
#define saved_pr	r16
#define saved_lc	r17
#define adest		r18
#define dest		r19
#define asrc		r20
#define src		r21
#define len		r22
#define tmp2		r23
#define tmp3		r24
#define	tmp4		r25
#define ptable		r26
#define ploop56		r27
#define	loopaddr	r28
#define	sh1		r29
#define ptr1		r30
#define ptr2		r31

#define movi0 		mov

#define p_scr		p6
#define p_xtr		p7
#define p_nxtr		p8
#define p_few		p9

#if defined(USE_FLP)
#define load		ldf8
#define store		stf8
#define tempreg		f6
#define the_r		fr
#define the_s		fs
#define the_t		ft
#define the_q		fq
#define the_w		fw
#define the_x		fx
#define the_y		fy
#define the_z		fz
#elif defined(USE_INT)
#define load		ld8
#define store		st8
#define tempreg		tmp2
#define the_r		r
#define the_s		s
#define the_t		t
#define the_q		q
#define the_w		w
#define the_x		x
#define the_y		y
#define the_z		z
#endif

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif
#if defined(USE_LFETCH)
#define LOOP(shift)						\
		ALIGN(32);					\
.loop##shift##:							\
{ .mmb								\
(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
(p[0])	lfetch.nt1 [ptr1], 16 ;					\
	nop.b 0 ;						\
} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
	nop.b 0 ;;						\
 } { .mmb							\
(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
(p[0])	lfetch.nt1	[ptr2], 16 ;				\
	nop.b 0 ;						\
} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
	br.ctop.sptk.many .loop##shift 				\
;; }								\
{ .mib								\
	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
}
#else
#define LOOP(shift)						\
		ALIGN(32);					\
.loop##shift##:							\
{ .mmb								\
(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
	nop.b 0 ;						\
} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
	nop.b 0 ;;						\
 } { .mmb							\
(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
	nop.b 0 ;						\
} { .mib							\
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
	br.ctop.sptk.many .loop##shift 				\
;; }								\
{ .mib								\
	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
}
#endif
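/* A note on the software pipeline above: each rotation issues two
   ld8.nt1 from the aligned asrc, and MEMLAT rotations later the shrp
   results are stored, so loads run far enough ahead of stores to hide
   memory latency.  br.ctop rotates the registers and predicates;
   ar.ec (set to MEMLAT + 2 before entry) drains the in-flight loads
   once the loop counter is exhausted, and the trailing br.cond hands
   the sub-16-byte tail to .copy_bytes.  */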


ENTRY(memcpy)
{ .mmi
	.prologue
	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
	.rotr	r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
	.rotp	p[MEMLAT+2]
	.rotf	fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
	mov	ret0 = in0		// return value = dest
	.save   pr, saved_pr
	movi0	saved_pr = pr		// save the predicate registers
} { .mmi
	and	tmp4 = 7, in0 		// check if destination is aligned
	mov 	dest = in0		// dest
	mov 	src = in1		// src
;; }
{ .mii
	cmp.eq	p_scr, p0 = in2, r0	// if (len == 0)
	.save   ar.lc, saved_lc
        movi0 	saved_lc = ar.lc	// save the loop counter
	.body
	cmp.ge	p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES?
} { .mbb
	mov	len = in2		// len
(p_scr)	br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few) br.cond.dpnt.many .copy_bytes	// Branch no. 2: copy byte by byte
;; }
{ .mmi
#if defined(USE_LFETCH)
	lfetch.nt1 [dest]		// prefetch the first dest line
	lfetch.nt1 [src]		// prefetch the first src line
#endif
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
} { .mib
	cmp.eq	p_scr, p0 = tmp4, r0	// is destination aligned?
	sub	loopcnt = 7, tmp4	// loopcnt = 7 - dest % 8
(p_scr) br.cond.dptk.many .dest_aligned
;; }
{ .mmi
	ld1	tmp2 = [src], 1		// load the first byte
	sub	len = len, loopcnt, 1	// len -= loopcnt + 1
	movi0	ar.lc = loopcnt		// set the loop counter
} { .mib
	cmp.ne  p_scr, p0 = 0, loopcnt	// avoid loading beyond end-point
;; }

.l0:	// ---------------------------- // L0: Align dest on 8-byte boundary
{ .mmi
	st1	[dest] = tmp2, 1	// store the current byte
(p_scr)	ld1	tmp2 = [src], 1		// load the next byte
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few .l0
;; }
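/* The byte loop above is itself software-pipelined one stage deep:
   each iteration stores the byte loaded on the previous pass while
   (p_scr) guards the next load, so src is never read past the bytes
   still owed.  It copies 8 - dest % 8 bytes, after which dest is
   8-byte aligned.  */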

.dest_aligned:
{ .mmi
	and	tmp4 = 7, src		// ready for alignment check
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
;; }
{ .mib
	cmp.ne	p_scr, p0 = tmp4, r0	// is source also aligned?
	tbit.nz p_xtr, p_nxtr = src, 3	// prepare a separate move if src
} { .mib				// is not 16B aligned
	add	ptr2 = LFETCH_DIST, dest	// prefetch address
	add	ptr1 = LFETCH_DIST, src
(p_scr) br.cond.dptk.many .src_not_aligned
;; }

// The optimal case, when dest and src are both aligned

.both_aligned:
{ .mmi
	.pred.rel "mutex",p_xtr,p_nxtr
(p_xtr)	cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt  // Need only N to qualify
	movi0	pr.rot = 1 << 16	// set rotating predicates
} { .mib
(p_scr) br.cond.dpnt.many .copy_full_words
;; }

{ .mmi
(p_xtr)	load	tempreg = [src], 8
(p_xtr) add 	elemcnt = -1, elemcnt
	movi0	ar.ec = MEMLAT + 1	// set the epilog counter
;; }
{ .mmi
(p_xtr) add	len = -8, len		// account for the extra word
	add 	asrc = 16, src 		// one bank apart (for USE_INT)
	shr.u	loopcnt = elemcnt, ALIGN_UNROLL_sh  // cater for unrolling
;; }
{ .mmi
	add	loopcnt = -1, loopcnt
(p_xtr)	store	[dest] = tempreg, 8	// copy the "extra" word
	nop.i	0
;; }
{ .mib
	add	adest = 16, dest
	movi0	ar.lc = loopcnt 	// set the loop counter
;; }
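/* Why the "extra" word: the USE_FLP loop below fetches with ldfp8,
   which loads a 16-byte pair and requires a 16-byte aligned address.
   If src is 8-byte but not 16-byte aligned (p_xtr), one word is
   copied up front so the loop sees aligned pairs; elemcnt and len are
   adjusted to match.  The loop is unrolled by ALIGN_UNROLL_no = 4
   words, hence loopcnt = elemcnt / 4 - 1 and 32 bytes move per
   iteration.  */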

#ifdef  GAS_ALIGN_BREAKS_UNWIND_INFO
	{ nop 0 }
#else
	.align	32
#endif
#if defined(USE_FLP)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1 [ptr2],32
#endif
(p[0])	ldfp8	the_r[0],the_q[0] = [src], 16
(p[0])	add	len = -32, len
} {.mmb
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
;; }
{ .mmi
#if defined(USE_LFETCH)
(p[0])	lfetch.nt1 [ptr1],32
#endif
(p[0])	ldfp8	the_s[0], the_t[0] = [src], 16
} {.mmb
(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
	br.ctop.dptk.many .l1
;; }
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
(p[0])	load	the_r[0] = [src], 8
(p[0])	load	the_q[0] = [asrc], 8
(p[0])	add	len = -32, len
} {.mmb
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
;; }
{ .mmi
(p[0])	load	the_s[0]  = [src], 24
(p[0])	load	the_t[0] = [asrc], 24
} {.mmb
(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
;; }
{ .mmb
(p[0])	lfetch.nt1 [ptr2],32
(p[0])	lfetch.nt1 [ptr1],32
#endif
	br.ctop.dptk.many .l1
;; }
#endif
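/* On exit from .l1, len holds what the unrolled loop did not cover:
   fewer than 32 bytes, i.e. at most three full words (finished by
   .copy_full_words below) plus a sub-word tail (.copy_bytes).  */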

.copy_full_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, len	// less than one full word left?
	shr.u	elemcnt = len, 3	// elemcnt = len / 8
(p_scr) br.cond.dpnt.many .copy_bytes
;; }
{ .mii
	load	tempreg = [src], 8
	add	loopcnt = -1, elemcnt	// loopcnt = elemcnt - 1
;; }
{ .mii
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
	mov	ar.lc = loopcnt		// set the loop counter
;; }

.l2: // ------------------------------- // L2: Max 4 words copied separately
{ .mmi
	store	[dest] = tempreg, 8
(p_scr)	load	tempreg = [src], 8	// load the next word
	add	len = -8, len
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few  .l2
;; }

.copy_bytes:
{ .mib
	cmp.eq	p_scr, p0 = len, r0	// is len == 0 ?
	add	loopcnt = -1, len	// len--;
(p_scr)	br.cond.spnt	.restore_and_exit
;; }
{ .mii
	ld1	tmp2 = [src], 1
	movi0	ar.lc = loopcnt
	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
;; }

.l3: // ------------------------------- // L3: Final byte move
{ .mmi
	st1	[dest] = tmp2, 1
(p_scr)	ld1	tmp2 = [src], 1
} { .mib
	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
	add	loopcnt = -1, loopcnt
	br.cloop.dptk.few  .l3
;; }

.restore_and_exit:
{ .mmi
	movi0	pr = saved_pr, -1	// restore the predicate registers
;; }
{ .mib
	movi0	ar.lc = saved_lc	// restore the loop counter
	br.ret.sptk.many b0
;; }


.src_not_aligned:
{ .mmi
	cmp.gt	p_scr, p0 = 16, len	// too few bytes for the 16B loop?
	and	sh1 = 7, src 		// sh1 = src % 8
	shr.u	loopcnt = len, 4	// element-cnt = len / 16
} { .mib
	add	tmp4 = @ltoff(.table), gp
	add 	tmp3 = @ltoff(.loop56), gp
(p_scr)	br.cond.dpnt.many .copy_bytes	// do byte by byte if too few
;; }
{ .mmi
	and	asrc = -8, src		// asrc = src & -8 -- align src for loop
	add 	loopcnt = -1, loopcnt	// loopcnt--
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
} { .mmi
	ld8	ptable = [tmp4]		// ptable = &table
	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
	and	tmp2 = -16, len		// tmp2 = len & (-16)
;; }
{ .mmi
	add	tmp3 = ptable, sh1	// tmp3 = &table + sh1
	add	src = src, tmp2		// src += len & (-16)
	movi0	ar.lc = loopcnt		// set LC
;; }
{ .mmi
	ld8	tmp4 = [tmp3]		// tmp4 = loop offset
	sub	len = len, tmp2		// len -= len & (-16)
	movi0	ar.ec = MEMLAT + 2 	// one more pass needed
;; }
{ .mmi
	ld8	s[1] = [asrc], 8	// preload
	sub	loopaddr = ploop56,tmp4	// loopaddr = &loop56 - loop offset
	movi0   pr.rot = 1 << 16	// set rotating predicates
;; }
{ .mib
	nop.m	0
	movi0	b6 = loopaddr
	br	b6			// jump to the appropriate loop
;; }
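/* Dispatch example, traced from the code above: for src % 8 == 3,
   sh1 = 24, so tmp4 = table[3] = .loop56 - .loop24 and
   loopaddr = &.loop56 - tmp4 = &.loop24, i.e. the variant whose shrp
   shifts by 24 bits (3 bytes).  */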

	LOOP(8)
	LOOP(16)
	LOOP(24)
	LOOP(32)
	LOOP(40)
	LOOP(48)
	LOOP(56)
END(memcpy)
libc_hidden_builtin_def (memcpy)

	.rodata
	.align 8
.table:
	data8	0			// dummy entry
	data8 	.loop56 - .loop8
	data8 	.loop56 - .loop16
	data8 	.loop56 - .loop24
	data8	.loop56 - .loop32
	data8	.loop56 - .loop40
	data8	.loop56 - .loop48
	data8	.loop56 - .loop56
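/* The table stores link-time differences rather than absolute
   addresses, so it needs no run-time relocations; the dispatch code
   reconstructs the target as &.loop56 minus the stored offset.  */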