Blame - ap/libc/glibc/glibc-2.23/sysdeps/alpha/stxncpy.S - T106_DC

blob: c5d997ab2220b9faf501ca515d76063d13933624 [file] [log] [blame]

xf.li	bdd93d5	2023-05-12 07:10:14 -0700	[diff] [blame]	1	/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
				2	Contributed by Richard Henderson (rth@tamu.edu)
				3	This file is part of the GNU C Library.
				4
				5	The GNU C Library is free software; you can redistribute it and/or
				6	modify it under the terms of the GNU Lesser General Public
				7	License as published by the Free Software Foundation; either
				8	version 2.1 of the License, or (at your option) any later version.
				9
				10	The GNU C Library is distributed in the hope that it will be useful,
				11	but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	Lesser General Public License for more details.
				14
				15	You should have received a copy of the GNU Lesser General Public
				16	License along with the GNU C Library. If not, see
				17	<http://www.gnu.org/licenses/>. */
				18
				19	/* Copy no more than COUNT bytes of the null-terminated string from
				20	SRC to DST.
				21
				22	This is an internal routine used by strncpy, stpncpy, and strncat.
				23	As such, it uses special linkage conventions to make implementation
				24	of these public functions more efficient.
				25
				26	On input:
				27	t9 = return address
				28	a0 = DST
				29	a1 = SRC
				30	a2 = COUNT
				31
				32	Furthermore, COUNT may not be zero.
				33
				34	On output:
				35	t0 = last word written
				36	t8 = bitmask (with one bit set) indicating the last byte written
				37	t10 = bitmask (with one bit set) indicating the byte position of
				38	the end of the range specified by COUNT
				39	a0 = unaligned address of the last word written
				40	a2 = the number of full words left in COUNT
				41
				42	Furthermore, v0, a3-a5, t11, and t12 are untouched.
				43	*/
				44
				45
				46	/* This is generally scheduled for the EV5, but should still be pretty
				47	good for the EV4 too. */
				48
				49	#include <sysdep.h>
				50
				51	.set noat # the assembler must not use the at register implicitly
				52	.set noreorder # keep the hand-scheduled instruction order
				53
				54	.text
				55	.type __stxncpy, @function
				56	.globl __stxncpy
				57	.usepv __stxncpy, no # the procedure value register is not used
				58
				59	cfi_startproc
				60	cfi_return_column (t9) # unwind info: the return address lives in t9
				61
				62	/* On entry to this basic block:
				63	t0 == the first destination word for masking back in
				64	t1 == the first source word. */
				65	.align 3
				66	stxncpy_aligned:
				67	/* Co-aligned path: create the 1st output word and detect 0's in the 1st input word. */
				68	lda t2, -1 # e1 : build a mask against false zero
				69	mskqh t2, a1, t2 # e0 : detection in the src word
				70	mskqh t1, a1, t3 # e0 : t3 = src bytes from the start offset up
				71	ornot t1, t2, t2 # .. e1 : force src bytes below the start non-zero
				72	mskql t0, a1, t0 # e0 : assemble the first output word
				73	cmpbge zero, t2, t7 # .. e1 : bits set iff null found
				74	or t0, t3, t0 # e0 : kept dst bytes merged with the src bytes
				75	beq a2, $a_eoc # .. e1 : no full words left in count
				76	bne t7, $a_eos # .. e1 : null found in the first word
				77
				78	/* On entry to this basic block:
				79	t0 == a source word not containing a null. */
				80	$a_loop:
				81	stq_u t0, 0(a0) # e0 : store the completed word
				82	addq a0, 8, a0 # .. e1 :
				83	ldq_u t0, 0(a1) # e0 : load the next src word
				84	addq a1, 8, a1 # .. e1 :
				85	subq a2, 1, a2 # e0 : one fewer full word left in count
				86	cmpbge zero, t0, t7 # .. e1 (stall)
				87	beq a2, $a_eoc # e1 : count ends within this word
				88	beq t7, $a_loop # e1 : no null yet: keep copying
				89
				90	/* Take care of the final (partial) word store. At this point
				91	the end-of-count bit is set in t7 iff it applies.
				92
				93	On entry to this basic block we have:
				94	t0 == the source word containing the null
				95	t7 == the cmpbge mask that found it. */
				96	$a_eos:
				97	negq t7, t8 # e0 : find low bit set
				98	and t7, t8, t8 # e1 (stall)
				99
				100	/* For the sake of the cache, don't read a destination word
				101	if we're not going to need it. */
				102	and t8, 0x80, t6 # e0 : is the last byte the top byte of the word?
				103	bne t6, 1f # .. e1 (zdb)
				104
				105	/* We're doing a partial word store and so need to combine
				106	our source and original destination words. */
				107	ldq_u t1, 0(a0) # e0 : original dst word
				108	subq t8, 1, t6 # .. e1 : bits below the low bit found above
				109	or t8, t6, t7 # e0 : t7 = mask of bytes up to and including it
				110	unop #
				111	zapnot t0, t7, t0 # e0 : clear src bytes > null
				112	zap t1, t7, t1 # .. e1 : clear dst bytes <= null
				113	or t0, t1, t0 # e1 : merged final word
				114
				115	1: stq_u t0, 0(a0) # e0 : store the last word
				116	ret (t9) # e1 : return address is in t9
				117
				118	/* Add the end-of-count bit to the eos detection bitmask. */
				119	$a_eoc:
				120	or t10, t7, t7 # t10 marks the last byte allowed by count
				121	br $a_eos
				122
				123	.align 3
				124	__stxncpy:
				125	/* Entry point. Are source and destination co-aligned? */
				126	lda t2, -1 # all ones
				127	xor a0, a1, t1 # low 3 bits are non-zero iff the misalignments differ
				128	srl t2, 1, t2 # t2 = LONG_MAX
				129	and a0, 7, t0 # find dest misalignment
				130	cmovlt a2, t2, a2 # bound neg count to LONG_MAX
				131	and t1, 7, t1 # keep only the low 3 bits of the xor
				132	addq a2, t0, a2 # bias count by dest misalignment
				133	subq a2, 1, a2 # biased index of the last byte allowed by count
				134	and a2, 7, t2 # position of that byte within its word
				135	srl a2, 3, a2 # a2 = loop counter = (count - 1)/8
				136	addq zero, 1, t10 # t10 = 1, shifted into place below
				137	sll t10, t2, t10 # t10 = bitmask of last count byte
				138	bne t1, $unaligned # misalignments differ: take the unaligned path
				139
				140	/* We are co-aligned; take care of a partial first word. */
				141
				142	ldq_u t1, 0(a1) # e0 : load first src word
				143	addq a1, 8, a1 # .. e1 :
				144
				145	beq t0, stxncpy_aligned # avoid loading dest word if not needed
				146	ldq_u t0, 0(a0) # e0 : first dest word, for the bytes below the start
				147	br stxncpy_aligned # .. e1 :
				148
				149
				150	/* The source and destination are not co-aligned. Align the destination
				151	and cope. We have to be very careful about not reading too much and
				152	causing a SEGV. */
				153
				154	.align 3
				155	$u_head:
				156	/* We know just enough now to be able to assemble the first
				157	full source word. We can still find a zero at the end of it
				158	that prevents us from outputting the whole thing.
				159
				160	On entry to this basic block:
				161	t0 == the first dest word, unmasked
				162	t1 == the shifted low bits of the first source word
				163	t6 == bytemask that is -1 in dest word bytes */
				164
				165	ldq_u t2, 8(a1) # e0 : load second src word
				166	addq a1, 8, a1 # .. e1 : step the src cursor to that word
				167	mskql t0, a0, t0 # e0 : mask trailing garbage in dst
				168	extqh t2, a1, t4 # e0 : position lo-bits of second src word
				169	or t1, t4, t1 # e1 : first aligned src word complete
				170	mskqh t1, a0, t1 # e0 : mask leading garbage in src
				171	or t0, t1, t0 # e0 : first output word complete
				172	or t0, t6, t6 # e1 : mask original data for zero test
				173	cmpbge zero, t6, t7 # e0 : nulls among the newly copied bytes
				174	beq a2, $u_eocfin # .. e1 : no full words left in count
				175	lda t6, -1 # e0 : all ones, for the mask built below
				176	bne t7, $u_final # .. e1 : null found: this is the last word
				177
				178	mskql t6, a1, t6 # e0 : mask out bits already seen
				179	nop # .. e1 :
				180	stq_u t0, 0(a0) # e0 : store first output word
				181	or t6, t2, t2 # .. e1 : force the already seen bytes non-zero
				182	cmpbge zero, t2, t7 # e0 : find nulls in second partial
				183	addq a0, 8, a0 # .. e1 :
				184	subq a2, 1, a2 # e0 : one fewer full word left in count
				185	bne t7, $u_late_head_exit # .. e1 : null in the unseen part
				186
				187	/* Finally, we've got all the stupid leading edge cases taken care
				188	of and we can set up to enter the main loop. */
				189
				190	extql t2, a1, t1 # e0 : position hi-bits of lo word
				191	beq a2, $u_eoc # .. e1 : no full words left in count
				192	ldq_u t2, 8(a1) # e0 : read next high-order source word
				193	addq a1, 8, a1 # .. e1 :
				194	extqh t2, a1, t0 # e0 : position lo-bits of hi word
				195	cmpbge zero, t2, t7 # .. e1 : test new word for eos
				196	nop # e0 :
				197	bne t7, $u_eos # .. e1 : null in the new word: skip the loop
				198
				199	/* Unaligned copy main loop. In order to avoid reading too much,
				200	the loop is structured to detect zeros in aligned source words.
				201	This has, unfortunately, effectively pulled half of a loop
				202	iteration out into the head and half into the tail, but it does
				203	prevent nastiness from accumulating in the very thing we want
				204	to run as fast as possible.
				205
				206	On entry to this basic block:
				207	t0 == the shifted low-order bits from the current source word
				208	t1 == the shifted high-order bits from the previous source word
				209	t2 == the unshifted current source word
				210
				211	We further know that t2 does not contain a null terminator. */
				212
				213	.align 3
				214	$u_loop:
				215	or t0, t1, t0 # e0 : current dst word now complete
				216	subq a2, 1, a2 # .. e1 : decrement word count
				217	stq_u t0, 0(a0) # e0 : save the current word
				218	addq a0, 8, a0 # .. e1 :
				219	extql t2, a1, t1 # e0 : extract high bits for next time
				220	beq a2, $u_eoc # .. e1 : count ran out before a null was seen
				221	ldq_u t2, 8(a1) # e0 : load high word for next time
				222	addq a1, 8, a1 # .. e1 :
				223	nop # e0 :
				224	cmpbge zero, t2, t7 # .. e1 : test new word for eos
				225	extqh t2, a1, t0 # e0 : extract low bits for current word
				226	beq t7, $u_loop # .. e1 : no null in the new word: iterate
				227
				228	/* We've found a zero somewhere in the source word we just read.
				229	If it resides in the lower half, we have one (probably partial)
				230	word to write out, and if it resides in the upper half, we
				231	have one full and one partial word left to write out.
				232
				233	On entry to this basic block:
				234	t0 == the shifted low-order bits from the current source word
				235	t1 == the shifted high-order bits from the previous source word
				236	t2 == the unshifted current source word. */
				237	$u_eos:
				238	or t0, t1, t0 # e0 : first (partial) source word complete
				239	cmpbge zero, t0, t7 # e0 : is the null in this first bit?
				240	bne t7, $u_final # .. e1 (zdb)
				241
				242	stq_u t0, 0(a0) # e0 : the null was in the high-order bits
				243	addq a0, 8, a0 # .. e1 :
				244	subq a2, 1, a2 # e0 : one fewer full word left in count
				245
				246	$u_late_head_exit:
				247	extql t2, a1, t0 # e0 : remaining hi-bits of the src word
				248	cmpbge zero, t0, t7 # e0 : find the null in them
				249	or t7, t10, t6 # e1 : same mask plus the end-of-count bit
				250	cmoveq a2, t6, t7 # e0 : use it only if no full words remain
				251
				252	/* Take care of a final (probably partial) result word.
				253	On entry to this basic block:
				254	t0 == assembled source word
				255	t7 == cmpbge mask that found the null. */
				256	$u_final:
				257	negq t7, t6 # e0 : isolate low bit set
				258	and t6, t7, t8 # e1 :
				259
				260	and t8, 0x80, t6 # e0 : avoid dest word load if we can
				261	bne t6, 1f # .. e1 (zdb)
				262
				263	ldq_u t1, 0(a0) # e0 : original dst word
				264	subq t8, 1, t6 # .. e1 : bits below the low bit found above
				265	or t6, t8, t7 # e0 : t7 = mask of bytes up to and including it
				266	zapnot t0, t7, t0 # .. e1 : kill source bytes > null
				267	zap t1, t7, t1 # e0 : kill dest bytes <= null
				268	or t0, t1, t0 # e1 : merged final word
				269
				270	1: stq_u t0, 0(a0) # e0 : store the last word
				271	ret (t9) # .. e1 : return address is in t9
				272
				273	/* Got to end-of-count before end of string.
				274	On entry to this basic block:
				275	t1 == the shifted high-order bits from the previous source word */
				276	$u_eoc:
				277	and a1, 7, t6 # e1 : byte offset used by the ext shifts
				278	sll t10, t6, t6 # e0 : move the end-of-count bit up by that offset
				279	and t6, 0xff, t6 # e0 : non-zero iff the bit stayed in the low 8
				280	bne t6, 1f # e1 : avoid src word load if we can
				281
				282	ldq_u t2, 8(a1) # e0 : load final src word
				283	nop # .. e1 :
				284	extqh t2, a1, t0 # e0 : position lo-bits of it for the last word
				285	or t1, t0, t1 # e1 : last assembled word complete
				286
				287	1: cmpbge zero, t1, t7 # find any null in the last word
				288	mov t1, t0 # $u_final expects the word in t0
				289
				290	$u_eocfin: # end-of-count, final word
				291	or t10, t7, t7 # add the end-of-count bit to the null mask
				292	br $u_final
				293
				294	/* Unaligned copy entry point. */
				295	.align 3
				296	$unaligned:
				297
				298	ldq_u t1, 0(a1) # e0 : load first source word
				299
				300	and a0, 7, t4 # .. e1 : find dest misalignment
				301	and a1, 7, t5 # e0 : find src misalignment
				302
				303	/* Conditionally load the first destination word and a bytemask
				304	with 0xff indicating that the destination byte is sacrosanct. */
				305
				306	mov zero, t0 # .. e1 : defaults used when dest is aligned
				307	mov zero, t6 # e0 :
				308	beq t4, 1f # .. e1 : dest aligned: no dest bytes to keep
				309	ldq_u t0, 0(a0) # e0 : first dest word
				310	lda t6, -1 # .. e1 :
				311	mskql t6, a0, t6 # e0 : 0xff in the bytes below the dest offset
				312	1:
				313	subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
				314
				315	/* If source misalignment is larger than dest misalignment, we need
				316	extra startup checks to avoid SEGV. */
				317
				318	cmplt t4, t5, t8 # e1 : t8 = (dest misalignment < src misalignment)
				319	extql t1, a1, t1 # .. e0 : shift src into place
				320	lda t2, -1 # e0 : for creating masks later
				321	beq t8, $u_head # e1 : not larger: no extra checks needed
				322
				323	mskqh t2, t5, t2 # e0 : begin src byte validity mask
				324	cmpbge zero, t1, t7 # .. e1 : is there a zero?
				325	extql t2, a1, t2 # e0 : shift the validity mask like the src
				326	or t7, t10, t5 # .. e1 : test for end-of-count too
				327	cmpbge zero, t2, t3 # e0 : t3 = bytes holding no valid src data
				328	cmoveq a2, t5, t7 # .. e1 : only if no full words remain
				329	andnot t7, t3, t7 # e0 : drop hits in the invalid bytes
				330	beq t7, $u_head # .. e1 (zdb)
				331
				332	/* At this point we've found a zero in the first partial word of
				333	the source. We need to isolate the valid source data and mask
				334	it into the original destination data. (Incidentally, we know
				335	that we'll need at least one byte of that original dest word.) */
				336
				337	ldq_u t0, 0(a0) # e0 : original first dest word
				338	negq t7, t6 # .. e1 : build bitmask of bytes <= zero
				339	mskqh t1, t4, t1 # e0 : clear src bytes below the dest offset
				340	and t6, t7, t8 # .. e1 : isolate low bit set
				341	subq t8, 1, t6 # e0 : bits below it
				342	or t6, t8, t7 # e1 : t7 = mask of bytes up to and including it
				343
				344	zapnot t2, t7, t2 # e0 : prepare source word; mirror changes
				345	zapnot t1, t7, t1 # .. e1 : to source validity mask
				346
				347	andnot t0, t2, t0 # e0 : zero place for source to reside
				348	or t0, t1, t0 # e1 : and put it there
				349	stq_u t0, 0(a0) # e0 : store the only output word
				350	ret (t9) # .. e1 : return address is in t9
				351
				352	cfi_endproc