/* Optimized version of the standard bzero() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear
   remaining words, and finally clear the remaining bytes.
   Since a stf.spill of f0 can store 16B in one go, we use this
   instruction to get peak speed.  */
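
/* For orientation only: the staged store strategy corresponds roughly to
   the C sketch below.  It is not part of the build, bzero_sketch is a
   hypothetical name, and the real code overlaps these stages and uses 16B
   stf.spill stores rather than byte loops.

     #include <stddef.h>
     #include <stdint.h>

     static void bzero_sketch (void *dest, size_t cnt)
     {
       unsigned char *p = dest;
       while (cnt > 0 && ((uintptr_t) p & 15) != 0)
         { *p++ = 0; cnt--; }                 // bytes up to a 16B boundary
       for (; cnt >= 128; p += 128, cnt -= 128)
         for (int i = 0; i < 128; i++)        // one 128B cache line per pass,
           p[i] = 0;                          // done as 16B spills in asm
       for (; cnt >= 32; p += 32, cnt -= 32)
         for (int i = 0; i < 32; i++)         // 32B chunks
           p[i] = 0;
       for (; cnt >= 8; p += 8, cnt -= 8)
         for (int i = 0; i < 8; i++)          // remaining 8B words
           p[i] = 0;
       while (cnt-- > 0)
         *p++ = 0;                            // remaining bytes
     }
*/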

#include "sysdep.h"

#ifdef __UCLIBC_SUSV3_LEGACY__

#undef ret

#define dest            in0
#define cnt             in1

#define tmp             r31
#define save_lc         r30
#define ptr0            r29
#define ptr1            r28
#define ptr2            r27
#define ptr3            r26
#define ptr9            r24
#define loopcnt         r23
#define linecnt         r22
#define bytecnt         r21

/* This routine uses only scratch predicate registers (p6 - p15) */
#define p_scr           p6      /* default register for same-cycle branches */
#define p_unalgn        p9
#define p_y             p11
#define p_n             p12
#define p_yy            p13
#define p_nn            p14

#define movi0           mov

#define MIN1            15
#define MIN1P1HALF      8
#define LINE_SIZE       128
#define LSIZE_SH        7       /* shift amount */
#define PREF_AHEAD      8

#define USE_FLP
#if defined(USE_INT)
#define store           st8
#define myval           r0
#elif defined(USE_FLP)
#define store           stf8
#define myval           f0
#endif
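
/* With the default USE_FLP, "store" expands to stf8 and "myval" to f0,
   which is architecturally wired to +0.0 (an all-zero memory image), so
   the zero stores flow through the FP ports; defining USE_INT instead
   uses st8 with the hardwired-zero integer register r0. */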

.align 64
ENTRY(bzero)
{ .mmi
        .prologue
        alloc   tmp = ar.pfs, 2, 0, 0, 0
        lfetch.nt1 [dest]
        .save   ar.lc, save_lc
        movi0   save_lc = ar.lc
} { .mmi
        .body
        mov     ret0 = dest             /* return value */
        nop.m   0
        cmp.eq  p_scr, p0 = cnt, r0
;; }
{ .mmi
        and     ptr2 = -(MIN1+1), dest  /* aligned address */
        and     tmp = MIN1, dest        /* prepare to check for alignment */
        tbit.nz p_y, p_n = dest, 0      /* Do we have an odd address? (M_B_U) */
} { .mib
        mov     ptr1 = dest
        nop.i   0
(p_scr) br.ret.dpnt.many rp             /* return immediately if count = 0 */
;; }
{ .mib
        cmp.ne  p_unalgn, p0 = tmp, r0
} { .mib                                /* NB: # of bytes to move is 1 */
        sub     bytecnt = (MIN1+1), tmp /* higher than loopcnt */
        cmp.gt  p_scr, p0 = 16, cnt     /* is it a minimalistic task? */
(p_scr) br.cond.dptk.many .move_bytes_unaligned /* go move just a few (M_B_U) */
;; }
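/* Unaligned head: clear the bytes up to the next 16B boundary with one
   conditional st8/st4/st2/st1 per set bit of bytecnt, the post-increment
   on each store repositioning ptr2 for the next smaller one. */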
{ .mmi
(p_unalgn) add  ptr1 = (MIN1+1), ptr2           /* after alignment */
(p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         /* after alignment */
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    /* should we do a st8 ? */
;; }
{ .mib
(p_y)   add     cnt = -8, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  /* should we do a st4 ? */
} { .mib
(p_y)   st8     [ptr2] = r0,-4
(p_n)   add     ptr2 = 4, ptr2
;; }
{ .mib
(p_yy)  add     cnt = -4, cnt
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    /* should we do a st2 ? */
} { .mib
(p_yy)  st4     [ptr2] = r0,-2
(p_nn)  add     ptr2 = 2, ptr2
;; }
{ .mmi
        mov     tmp = LINE_SIZE+1               /* for compare */
(p_y)   add     cnt = -2, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  /* should we do a st1 ? */
} { .mmi
        nop.m   0
(p_y)   st2     [ptr2] = r0,-1
(p_n)   add     ptr2 = 1, ptr2
;; }

{ .mmi
(p_yy)  st1     [ptr2] = r0
        cmp.gt  p_scr, p0 = tmp, cnt            /* is it a minimalistic task? */
} { .mbb
(p_yy)  add     cnt = -1, cnt
(p_scr) br.cond.dpnt.many .fraction_of_line     /* go move just a few */
;; }
{ .mib
        nop.m   0
        shr.u   linecnt = cnt, LSIZE_SH
        nop.b   0
;; }

        .align 32
.l1b:   /* ------------------ L1B: store ahead into cache lines; fill later */
{ .mmi
        and     tmp = -(LINE_SIZE), cnt         /* compute end of range */
        mov     ptr9 = ptr1                     /* used for prefetching */
        and     cnt = (LINE_SIZE-1), cnt        /* remainder */
} { .mmi
        mov     loopcnt = PREF_AHEAD-1          /* default prefetch loop */
        cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */
;; }
{ .mmi
(p_scr) add     loopcnt = -1, linecnt
        add     ptr2 = 16, ptr1         /* start of stores (beyond prefetch stores) */
        add     ptr1 = tmp, ptr1        /* first address beyond total range */
;; }
{ .mmi
        add     tmp = -1, linecnt       /* next loop count */
        movi0   ar.lc = loopcnt
;; }
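/* Touch the first PREF_AHEAD (or fewer) cache lines with one 16B spill
   each, one LINE_SIZE apart, so the lines are allocated in cache before
   the fill loop below reaches them. */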
.pref_l1b:
{ .mib
        stf.spill [ptr9] = f0, 128      /* Do stores one cache line apart */
        nop.i   0
        br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
        add     ptr0 = 16, ptr2         /* Two stores in parallel */
        movi0   ar.lc = tmp
;; }
.l1bx:
{ .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
;; }
{ .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
;; }
{ .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 64
        cmp.lt  p_scr, p0 = ptr9, ptr1  /* do we need more prefetching? */
;; }
{ .mmb
        stf.spill [ptr2] = f0, 32
(p_scr) stf.spill [ptr9] = f0, 128
        br.cloop.dptk.few .l1bx
;; }
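/* Each .l1bx pass completes one 128B line: the prefetch store already
   cleared its first 16B, and ptr2/ptr0 (16B apart) clear the remaining
   112B with seven spills, while the rolling (p_scr) spill through ptr9
   keeps allocating lines ahead until ptr9 passes ptr1. */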
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt      /* just a few bytes left ? */
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment
;; }

.fraction_of_line:
{ .mib
        add     ptr2 = 16, ptr1
        shr.u   loopcnt = cnt, 5        /* loopcnt = cnt / 32 */
;; }
{ .mib
        cmp.eq  p_scr, p0 = loopcnt, r0
        add     loopcnt = -1, loopcnt
(p_scr) br.cond.dpnt.many .store_words
;; }
{ .mib
        and     cnt = 0x1f, cnt         /* compute the remaining cnt */
        movi0   ar.lc = loopcnt
;; }
        .align 32
.l2:    /* ----------------------------- L2A: store 32B in 2 cycles */
{ .mmb
        store   [ptr1] = myval, 8
        store   [ptr2] = myval, 8
;; } { .mmb
        store   [ptr1] = myval, 24
        store   [ptr2] = myval, 24
        br.cloop.dptk.many .l2
;; }
.store_words:
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt      /* just a few bytes left ? */
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment    /* Branch */
;; }

{ .mmi
        store   [ptr1] = myval, 8       /* store */
        cmp.le  p_y, p_n = 16, cnt
        add     cnt = -8, cnt           /* subtract */
;; }
{ .mmi
(p_y)   store   [ptr1] = myval, 8       /* store */
(p_y)   cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)   add     cnt = -8, cnt           /* subtract */
;; }
{ .mmi                                  /* store */
(p_yy)  store   [ptr1] = myval, 8
(p_yy)  add     cnt = -8, cnt           /* subtract */
;; }

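/* At most 7 bytes remain: terminate with a conditional st4/st2/st1
   cascade driven by the low three bits of cnt. */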
.move_bytes_from_alignment:
{ .mib
        cmp.eq  p_scr, p0 = cnt, r0
        tbit.nz.unc p_y, p0 = cnt, 2    /* should we terminate with a st4 ? */
(p_scr) br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)   st4     [ptr1] = r0,4
        tbit.nz.unc p_yy, p0 = cnt, 1   /* should we terminate with a st2 ? */
;; }
{ .mib
(p_yy)  st2     [ptr1] = r0,2
        tbit.nz.unc p_y, p0 = cnt, 0    /* should we terminate with a st1 ? */
;; }

{ .mib
(p_y)   st1     [ptr1] = r0
;; }
.restore_and_exit:
{ .mib
        nop.m   0
        movi0   ar.lc = save_lc
        br.ret.sptk.many rp
;; }

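/* Short unaligned case (count < 16): an optional st1 first if dest is odd,
   then pairs of st2 through ptr1 and ptr2 (2 apart) clear 4 bytes per
   cycle; the tail is finished with an optional st2 plus an optional st1
   through ptr3, which points at the last byte of the range. */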
.move_bytes_unaligned:
{ .mmi
        .pred.rel "mutex",p_y, p_n
        .pred.rel "mutex",p_yy, p_nn
(p_n)   cmp.le  p_yy, p_nn = 4, cnt
(p_y)   cmp.le  p_yy, p_nn = 5, cnt
(p_n)   add     ptr2 = 2, ptr1
} { .mmi
(p_y)   add     ptr2 = 3, ptr1
(p_y)   st1     [ptr1] = r0, 1          /* fill 1 (odd-aligned) byte */
(p_y)   add     cnt = -1, cnt           /* [15, 14 (or less) left] */
;; }
{ .mmi
(p_yy)  cmp.le.unc p_y, p0 = 8, cnt
        add     ptr3 = ptr1, cnt        /* prepare last store */
        movi0   ar.lc = save_lc
} { .mmi
(p_yy)  st2     [ptr1] = r0, 4          /* fill 2 (aligned) bytes */
(p_yy)  st2     [ptr2] = r0, 4          /* fill 2 (aligned) bytes */
(p_yy)  add     cnt = -4, cnt           /* [11, 10 (or less) left] */
;; }
{ .mmi
(p_y)   cmp.le.unc p_yy, p0 = 8, cnt
        add     ptr3 = -1, ptr3         /* last store */
        tbit.nz p_scr, p0 = cnt, 1      /* will there be a st2 at the end ? */
} { .mmi
(p_y)   st2     [ptr1] = r0, 4          /* fill 2 (aligned) bytes */
(p_y)   st2     [ptr2] = r0, 4          /* fill 2 (aligned) bytes */
(p_y)   add     cnt = -4, cnt           /* [7, 6 (or less) left] */
;; }
{ .mmi
(p_yy)  st2     [ptr1] = r0, 4          /* fill 2 (aligned) bytes */
(p_yy)  st2     [ptr2] = r0, 4          /* fill 2 (aligned) bytes */
                                        /* [3, 2 (or less) left] */
        tbit.nz p_y, p0 = cnt, 0        /* will there be a st1 at the end ? */
} { .mmi
(p_yy)  add     cnt = -4, cnt
;; }
{ .mmb
(p_scr) st2     [ptr1] = r0             /* fill 2 (aligned) bytes */
(p_y)   st1     [ptr3] = r0             /* fill last byte (using ptr3) */
        br.ret.sptk.many rp
;; }
END(bzero)

#endif