Blame - ap/build/uClibc/libc/string/ia64/memset.S - T106_DC

blob: 45df5838ec3e2d73cd2e3052fe3d6a6b51848c33 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame^]	1	/* Optimized version of the standard memset() function.
				2	This file is part of the GNU C Library.
				3	Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
				4	Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
				5	Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
				6
				7	The GNU C Library is free software; you can redistribute it and/or
				8	modify it under the terms of the GNU Lesser General Public
				9	License as published by the Free Software Foundation; either
				10	version 2.1 of the License, or (at your option) any later version.
				11
				12	The GNU C Library is distributed in the hope that it will be useful,
				13	but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				15	Lesser General Public License for more details.
				16
				17	You should have received a copy of the GNU Lesser General Public
				18	License along with the GNU C Library; if not, write to the Free
				19	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
				20	02111-1307 USA. */
				21
				22	/* Return: dest
				23
				24	Inputs:
				25	in0: dest
				26	in1: value
				27	in2: count
				28
				29	The algorithm is fairly straightforward: set byte by byte until
				30	we get to a 16B-aligned address, then loop on 128 B chunks using an
				31	early store as prefetching, then loop on 32B chunks, then clear remaining
				32	words, finally clear remaining bytes.
				33	Since a stf.spill f0 can store 16B in one go, we use this instruction
				34	to get peak speed when value = 0. */
				35
				36	#include "sysdep.h"
				37	#undef ret
				38
				39	#define dest in0 /* 1st argument: start of the area to fill */
				40	#define value in1 /* 2nd argument: fill value; replicated to 8 bytes in place */
				41	#define cnt in2 /* 3rd argument: byte count; counted down as bytes are stored */
				42
				43	#define tmp r31 /* scratch */
				44	#define save_lc r30 /* copy of ar.lc taken on entry, written back before returning */
				45	#define ptr0 r29 /* second store pointer inside the line loops */
				46	#define ptr1 r28 /* main store pointer */
				47	#define ptr2 r27 /* auxiliary store pointer (alignment stores, line and 32B loops) */
				48	#define ptr3 r26 /* last-byte address, used only in .move_bytes_unaligned */
				49	#define ptr9 r24 /* store-ahead ("prefetch") pointer, advances one line per store */
				50	#define loopcnt r23 /* loop count staged for ar.lc */
				51	#define linecnt r22 /* cnt >> LSIZE_SH: number of whole lines to fill */
				52	#define bytecnt r21 /* bytes from dest up to the next 16B boundary */
				53
				54	#define fvalue f6 /* FLP-side copy of the replicated value (written by setf.sig) */
				55
				56	/* This routine uses only scratch predicate registers (p6 - p15) */
				57	#define p_scr p6 /* default register for same-cycle branches */
				58	#define p_nz p7 /* value != 0 (set at entry, not read afterwards) */
				59	#define p_zr p8 /* value == 0: selects the stf.spill line loop */
				60	#define p_unalgn p9 /* dest is not 16B-aligned */
				61	#define p_y p11 /* general-purpose yes/no pair ... */
				62	#define p_n p12
				63	#define p_yy p13 /* ... and a second yes/no pair */
				64	#define p_nn p14
				65
				66	#define movi0 mov /* spelling used for the moves to and from ar.lc */
				67
				68	#define MIN1 15 /* low-bit mask for 16B alignment; MIN1+1 = 16 */
				69	#define MIN1P1HALF 8 /* (MIN1+1)/2 */
				70	#define LINE_SIZE 128 /* bytes handled per iteration of the line loops */
				71	#define LSIZE_SH 7 /* shift amount */
				72	#define PREF_AHEAD 8 /* max. number of lines touched by the store-ahead loop */
				73
				74	#define USE_FLP /* pick the FLP flavour of `store'/`myval' below */
				75	#if defined(USE_INT) /* tested first: wins if USE_INT is defined as well */
				76	#define store st8
				77	#define myval value
				78	#elif defined(USE_FLP)
				79	#define store stf8
				80	#define myval fvalue
				81	#endif
				82
				83	.align 64
					/* memset (dest, value, cnt)
					
					   Returns dest (copied to ret0 before any pointer is advanced).
					
					   Control flow, in the order the code appears:
					   - cnt == 0: return at once.
					   - cnt < 16: branch to .move_bytes_unaligned, which only uses
					     st1/st2 stores.
					   - Otherwise move ptr1 up to the next 16B boundary, filling the
					     bytes between dest and that boundary with at most one each of
					     st8/st4/st2/st1 (selected by the bits of bytecnt).
					   - If cnt, tested just before the optional st1 is accounted
					     for, is below LINE_SIZE+1, skip to .fraction_of_line.
					     Otherwise run a line loop: .l1ax (8-byte `store') when
					     value != 0, .l1bx (stf.spill of f0) when value == 0.
					   - .fraction_of_line / .l2: 32 bytes per iteration.
					   - store_words: up to three 8-byte stores.
					   - .move_bytes_from_alignment: st4, st2, st1 selected by
					     bits 2, 1, 0 of cnt.
					
					   ar.lc is copied to save_lc on entry and written back at
					   .restore_and_exit and in .move_bytes_unaligned.
					
					   NOTE(review): the tests on cnt use cmp.gt/cmp.le, which look
					   like the signed compare forms; presumably no caller passes a
					   count with the top bit set -- confirm.  */
				84	ENTRY(memset)
				85	{ .mmi
				86	.prologue
				87	alloc tmp = ar.pfs, 3, 0, 0, 0 /* three inputs: dest, value, cnt */
				88	lfetch.nt1 [dest]
				89	.save ar.lc, save_lc
				90	movi0 save_lc = ar.lc /* ar.lc is reused by the loops below */
				91	} { .mmi
				92	.body
				93	mov ret0 = dest /* return value */
				94	cmp.ne p_nz, p_zr = value, r0 /* use stf.spill if value is zero */
				95	cmp.eq p_scr, p0 = cnt, r0 /* cnt == 0 ? */
				96	;; }
				97	{ .mmi
				98	and ptr2 = -(MIN1+1), dest /* aligned address (dest rounded down to 16B) */
				99	and tmp = MIN1, dest /* prepare to check for alignment */
				100	tbit.nz p_y, p_n = dest, 0 /* Do we have an odd address? (read in .move_bytes_unaligned, M_B_U) */
				101	} { .mib
				102	mov ptr1 = dest /* ptr1 = current store address */
				103	mux1 value = value, @brcst /* create 8 identical bytes in word */
				104	(p_scr) br.ret.dpnt.many rp /* return immediately if count = 0 */
				105	;; }
				106	{ .mib
				107	cmp.ne p_unalgn, p0 = tmp, r0 /* dest not 16B-aligned ? */
				108	} { .mib /* NB: # of bytes to move is 1 higher than loopcnt */
				109	sub bytecnt = (MIN1+1), tmp /* bytes from dest up to the next 16B boundary */
				110	cmp.gt p_scr, p0 = 16, cnt /* fewer than 16 bytes in total? */
				111	(p_scr) br.cond.dptk.many .move_bytes_unaligned /* go move just a few (M_B_U) */
				112	;; }
				113	{ .mmi
				114	(p_unalgn) add ptr1 = (MIN1+1), ptr2 /* ptr1 = next 16B boundary: where stores resume after alignment */
				115	(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 /* ptr2 = boundary - 8: slot for the st8 */
				116	(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 /* should we do a st8 ? */
				117	;; }
				118	{ .mib
				119	(p_y) add cnt = -8, cnt /* account for the st8 */
				120	(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 /* should we do a st4 ? */
				121	} { .mib
				122	(p_y) st8 [ptr2] = value, -4 /* 8 bytes ending at the boundary; ptr2 -> st4 slot below them */
				123	(p_n) add ptr2 = 4, ptr2 /* no st8: st4 slot is boundary - 4 */
				124	;; }
				125	{ .mib
				126	(p_yy) add cnt = -4, cnt /* account for the st4 */
				127	(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 /* should we do a st2 ? */
				128	} { .mib
				129	(p_yy) st4 [ptr2] = value, -2 /* ptr2 -> st2 slot just below */
				130	(p_nn) add ptr2 = 2, ptr2 /* no st4: st2 slot is the upper half of the st4 slot */
				131	;; }
				132	{ .mmi
				133	mov tmp = LINE_SIZE+1 /* for compare (threshold for entering the line loops) */
				134	(p_y) add cnt = -2, cnt /* account for the st2 */
				135	(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 /* should we do a st1 ? */
				136	} { .mmi
				137	setf.sig fvalue=value /* transfer value to FLP side */
				138	(p_y) st2 [ptr2] = value, -1 /* ptr2 -> st1 slot just below */
				139	(p_n) add ptr2 = 1, ptr2 /* no st2: st1 slot is the upper byte of the st2 slot */
				140	;; }
				141
				142	{ .mmi
				143	(p_yy) st1 [ptr2] = value
				144	cmp.gt p_scr, p0 = tmp, cnt /* fewer than LINE_SIZE+1 left? (cnt read before the add below) */
				145	} { .mbb
				146	(p_yy) add cnt = -1, cnt /* account for the st1 */
				147	(p_scr) br.cond.dpnt.many .fraction_of_line /* go move just a few */
				148	;; }
				149
				150	{ .mib
				151	nop.m 0
				152	shr.u linecnt = cnt, LSIZE_SH /* number of whole LINE_SIZE lines */
				153	(p_zr) br.cond.dptk.many .l1b /* Jump to use stf.spill */
				154	;; }
				155
				156	#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
				157	.align 32 /* -------- L1A: store ahead into cache lines; fill later */
				158	#endif
				159	{ .mmi
				160	and tmp = -(LINE_SIZE), cnt /* compute end of range (bytes covered by whole lines) */
				161	mov ptr9 = ptr1 /* used for prefetching */
				162	and cnt = (LINE_SIZE-1), cnt /* remainder */
				163	} { .mmi
				164	mov loopcnt = PREF_AHEAD-1 /* default prefetch loop */
				165	cmp.gt p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */
				166	;; }
				167	{ .mmi
				168	(p_scr) add loopcnt = -1, linecnt /* fewer than PREF_AHEAD lines: store ahead into linecnt lines only */
				169	add ptr2 = 8, ptr1 /* start of stores (beyond prefetch stores) */
				170	add ptr1 = tmp, ptr1 /* first address beyond total range */
				171	;; }
				172	{ .mmi
				173	add tmp = -1, linecnt /* next loop count */
				174	movi0 ar.lc = loopcnt
				175	;; }
				176	.pref_l1a:
				177	{ .mib
				178	store [ptr9] = myval, 128 /* Do stores one cache line apart */
				179	nop.i 0
				180	br.cloop.dptk.few .pref_l1a
				181	;; }
				182	{ .mmi
				183	add ptr0 = 16, ptr2 /* Two stores in parallel */
				184	movi0 ar.lc = tmp /* linecnt - 1: one pass of .l1ax per line */
				185	;; }
				186	.l1ax: /* per pass: 15 `store's through ptr2/ptr0 at line offsets 8, 16, ..., 120; offset 0 is written through ptr9 */
				187	{ .mmi
				188	store [ptr2] = myval, 8
				189	store [ptr0] = myval, 8
				190	;; }
				191	{ .mmi
				192	store [ptr2] = myval, 24
				193	store [ptr0] = myval, 24
				194	;; }
				195	{ .mmi
				196	store [ptr2] = myval, 8
				197	store [ptr0] = myval, 8
				198	;; }
				199	{ .mmi
				200	store [ptr2] = myval, 24
				201	store [ptr0] = myval, 24
				202	;; }
				203	{ .mmi
				204	store [ptr2] = myval, 8
				205	store [ptr0] = myval, 8
				206	;; }
				207	{ .mmi
				208	store [ptr2] = myval, 24
				209	store [ptr0] = myval, 24
				210	;; }
				211	{ .mmi
				212	store [ptr2] = myval, 8
				213	store [ptr0] = myval, 32 /* ptr0 -> offset 24 of the next line */
				214	cmp.lt p_scr, p0 = ptr9, ptr1 /* do we need more prefetching? */
				215	;; }
				216	{ .mmb
				217	store [ptr2] = myval, 24 /* ptr2 -> offset 8 of the next line */
				218	(p_scr) store [ptr9] = myval, 128 /* store ahead into one more line */
				219	br.cloop.dptk.few .l1ax
				220	;; }
				221	{ .mbb
				222	cmp.le p_scr, p0 = 8, cnt /* at least 8 bytes left ? */
				223	(p_scr) br.cond.dpnt.many .fraction_of_line /* Branch no. 2 */
				224	br.cond.dpnt.many .move_bytes_from_alignment /* Branch no. 3 */
				225	;; }
				226
				227	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
				228	{ nop 0 }
				229	#else
				230	.align 32
				231	#endif
				232	.l1b: /* ------------------ L1B: store ahead into cache lines; fill later */
				233	{ .mmi
				234	and tmp = -(LINE_SIZE), cnt /* compute end of range (bytes covered by whole lines) */
				235	mov ptr9 = ptr1 /* used for prefetching */
				236	and cnt = (LINE_SIZE-1), cnt /* remainder */
				237	} { .mmi
				238	mov loopcnt = PREF_AHEAD-1 /* default prefetch loop */
				239	cmp.gt p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */
				240	;; }
				241	{ .mmi
				242	(p_scr) add loopcnt = -1, linecnt /* fewer than PREF_AHEAD lines: store ahead into linecnt lines only */
				243	add ptr2 = 16, ptr1 /* start of stores (beyond prefetch stores) */
				244	add ptr1 = tmp, ptr1 /* first address beyond total range */
				245	;; }
				246	{ .mmi
				247	add tmp = -1, linecnt /* next loop count */
				248	movi0 ar.lc = loopcnt
				249	;; }
				250	.pref_l1b:
				251	{ .mib
				252	stf.spill [ptr9] = f0, 128 /* Do stores one cache line apart */
				253	nop.i 0
				254	br.cloop.dptk.few .pref_l1b
				255	;; }
				256	{ .mmi
				257	add ptr0 = 16, ptr2 /* Two stores in parallel */
				258	movi0 ar.lc = tmp /* linecnt - 1: one pass of .l1bx per line */
				259	;; }
				260	.l1bx: /* per pass: 7 stf.spill stores through ptr2/ptr0 at line offsets 16, 32, ..., 112; offset 0 is written through ptr9 */
				261	{ .mmi
				262	stf.spill [ptr2] = f0, 32
				263	stf.spill [ptr0] = f0, 32
				264	;; }
				265	{ .mmi
				266	stf.spill [ptr2] = f0, 32
				267	stf.spill [ptr0] = f0, 32
				268	;; }
				269	{ .mmi
				270	stf.spill [ptr2] = f0, 32
				271	stf.spill [ptr0] = f0, 64 /* ptr0 -> offset 32 of the next line */
				272	cmp.lt p_scr, p0 = ptr9, ptr1 /* do we need more prefetching? */
				273	;; }
				274	{ .mmb
				275	stf.spill [ptr2] = f0, 32 /* ptr2 -> offset 16 of the next line */
				276	(p_scr) stf.spill [ptr9] = f0, 128 /* store ahead into one more line */
				277	br.cloop.dptk.few .l1bx
				278	;; }
				279	{ .mib
				280	cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? (fewer than 8) */
				281	(p_scr) br.cond.dpnt.many .move_bytes_from_alignment /* otherwise fall through into .fraction_of_line */
				282	;; }
				283
				284	.fraction_of_line:
				285	{ .mib
				286	add ptr2 = 16, ptr1 /* second store pointer, 16 bytes ahead of ptr1 */
				287	shr.u loopcnt = cnt, 5 /* loopcnt = cnt / 32 */
				288	;; }
				289	{ .mib
				290	cmp.eq p_scr, p0 = loopcnt, r0 /* no complete 32B chunk ? */
				291	add loopcnt = -1, loopcnt /* ar.lc gets chunks - 1, same convention as the line loops */
				292	(p_scr) br.cond.dpnt.many store_words
				293	;; }
				294	{ .mib
				295	and cnt = 0x1f, cnt /* compute the remaining cnt */
				296	movi0 ar.lc = loopcnt
				297	;; }
				298	#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
				299	.align 32
				300	#endif
				301	.l2: /* ---------------------------- L2A: store 32B in 2 cycles */
				302	{ .mmb
				303	store [ptr1] = myval, 8 /* chunk offset 0 */
				304	store [ptr2] = myval, 8 /* chunk offset 16 */
				305	;; } { .mmb
				306	store [ptr1] = myval, 24 /* chunk offset 8; ptr1 -> next chunk */
				307	store [ptr2] = myval, 24 /* chunk offset 24; ptr2 -> next chunk + 16 */
				308	br.cloop.dptk.many .l2
				309	;; }
				310	store_words: /* fewer than 32 bytes left: up to three 8-byte stores */
				311	{ .mib
				312	cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? (fewer than 8) */
				313	(p_scr) br.cond.dpnt.many .move_bytes_from_alignment /* Branch */
				314	;; }
				315
				316	{ .mmi
				317	store [ptr1] = myval, 8 /* 1st 8-byte store */
				318	cmp.le p_y, p_n = 16, cnt /* room for a 2nd one ? (cnt read before the subtract below) */
				319	add cnt = -8, cnt /* subtract */
				320	;; }
				321	{ .mmi
				322	(p_y) store [ptr1] = myval, 8 /* 2nd 8-byte store */
				323	(p_y) cmp.le.unc p_yy, p_nn = 16, cnt /* room for a 3rd one ? */
				324	(p_y) add cnt = -8, cnt /* subtract */
				325	;; }
				326	{ .mmi
				327	(p_yy) store [ptr1] = myval, 8 /* 3rd 8-byte store */
				328	(p_yy) add cnt = -8, cnt /* subtract */
				329	;; }
				330
				331	.move_bytes_from_alignment: /* every path reaching this label has fewer than 8 bytes left, starting at ptr1 */
				332	{ .mib
				333	cmp.eq p_scr, p0 = cnt, r0 /* nothing left ? */
				334	tbit.nz.unc p_y, p0 = cnt, 2 /* should we terminate with a st4 ? */
				335	(p_scr) br.cond.dpnt.few .restore_and_exit
				336	;; }
				337	{ .mib
				338	(p_y) st4 [ptr1] = value, 4
				339	tbit.nz.unc p_yy, p0 = cnt, 1 /* should we terminate with a st2 ? */
				340	;; }
				341	{ .mib
				342	(p_yy) st2 [ptr1] = value, 2
				343	tbit.nz.unc p_y, p0 = cnt, 0 /* should we terminate with a st1 ? */
				344	;; }
				345
				346	{ .mib
				347	(p_y) st1 [ptr1] = value
				348	;; }
				349	.restore_and_exit:
				350	{ .mib
				351	nop.m 0
				352	movi0 ar.lc = save_lc /* write back the value saved on entry */
				353	br.ret.sptk.many rp
				354	;; }
				355
				356	.move_bytes_unaligned: /* reached with 1..15 bytes to fill; p_y/p_n still hold odd/even dest from the entry code */
				357	{ .mmi
				358	.pred.rel "mutex",p_y, p_n
				359	.pred.rel "mutex",p_yy, p_nn
				360	(p_n) cmp.le p_yy, p_nn = 4, cnt /* even dest: at least 4 bytes ? */
				361	(p_y) cmp.le p_yy, p_nn = 5, cnt /* odd dest: at least 4 bytes after the leading st1 ? */
				362	(p_n) add ptr2 = 2, ptr1 /* ptr2 = 2 bytes past the (even) ptr1 */
				363	} { .mmi
				364	(p_y) add ptr2 = 3, ptr1 /* same, allowing for the st1 post-increment of ptr1 */
				365	(p_y) st1 [ptr1] = value, 1 /* fill 1 (odd-aligned) byte */
				366	(p_y) add cnt = -1, cnt /* [15, 14 (or less) left] */
				367	;; }
				368	{ .mmi
				369	(p_yy) cmp.le.unc p_y, p0 = 8, cnt /* a second group of 4 after this one ? (cnt read before the -4 below) */
				370	add ptr3 = ptr1, cnt /* prepare last store (ptr3 = one past the last byte) */
				371	movi0 ar.lc = save_lc /* ar.lc has not been changed on this path */
				372	} { .mmi
				373	(p_yy) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */
				374	(p_yy) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */
				375	(p_yy) add cnt = -4, cnt /* [11, 10 (or less) left] */
				376	;; }
				377	{ .mmi
				378	(p_y) cmp.le.unc p_yy, p0 = 8, cnt /* a third group of 4 after this one ? */
				379	add ptr3 = -1, ptr3 /* last store (ptr3 = address of the last byte) */
				380	tbit.nz p_scr, p0 = cnt, 1 /* will there be a st2 at the end ? (bit 1 is unchanged by the -4 steps) */
				381	} { .mmi
				382	(p_y) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */
				383	(p_y) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */
				384	(p_y) add cnt = -4, cnt /* [7, 6 (or less) left] */
				385	;; }
				386	{ .mmi
				387	(p_yy) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */
				388	(p_yy) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */
				389	/* [3, 2 (or less) left] */
				390	tbit.nz p_y, p0 = cnt, 0 /* will there be a st1 at the end ? */
				391	} { .mmi
				392	(p_yy) add cnt = -4, cnt
				393	;; }
				394	{ .mmb
				395	(p_scr) st2 [ptr1] = value /* fill 2 (aligned) bytes */
				396	(p_y) st1 [ptr3] = value /* fill last byte (using ptr3) */
				397	br.ret.sptk.many rp
				398	;; }
				399	END(memset)
				400	libc_hidden_def (memset)