Blame - src/kernel/linux/v4.14/arch/alpha/lib/ev6-memchr.S - T103

blob: 56bf9e14eeeefadf510cbe4d52fba27c0f1f5701 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/* SPDX-License-Identifier: GPL-2.0 */
				2	/*
				3	* arch/alpha/lib/ev6-memchr.S
				4	*
				5	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				6	*
				7	* Finds characters in a memory area. Optimized for the Alpha:
				8	*
				9	* - memory accessed as aligned quadwords only
				10	* - uses cmpbge to compare 8 bytes in parallel
				11	* - does binary search to find 0 byte in last
				12	* quadword (HAKMEM needed 12 instructions to
				13	* do this instead of the 9 instructions that
				14	* binary search needs).
				15	*
				16	* For correctness consider that:
				17	*
				18	* - only minimum number of quadwords may be accessed
				19	* - the third argument is an unsigned long
				20	*
				21	* Much of the information about 21264 scheduling/coding comes from:
				22	* Compiler Writer's Guide for the Alpha 21264
				23	* abbreviated as 'CWG' in other comments here
				24	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				25	* Scheduling notation:
				26	* E - either cluster
				27	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				28	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				29	* Try not to change the actual algorithm if possible for consistency.
				30	*/
				31	#include <asm/export.h>
				32	.set noreorder
				33	.set noat
				34
				35	.align 4
				36	.globl memchr
				37	.ent memchr
					#
					# memchr: look for a byte value in a memory area.
					#
					# Inputs, as used by the code below:
					#   $16 = address of the first byte to search
					#   $17 = byte value to look for (only the low 8 bits are kept)
					#   $18 = number of bytes to search
					# Result:
					#   $0  = address of the matching byte with the lowest index,
					#         or 0 when no byte in the area matches
					# Registers written: $0-$7, $17, $18.
					#
					# Outline:
					#   - replicate the byte into all eight byte lanes of $17
					#   - xor a quadword with $17 so that matching bytes become 0,
					#     then cmpbge against $31 to get one mask bit per zero byte
					#   - mask off the bytes that lie outside the area in the first
					#     and in the last quadword
					#   - turn the lowest set mask bit into a byte address
					#
				38	memchr:
					# Frame size is 0 and the return address stays in $26.
				39	.frame $30,0,$26,0
				40	.prologue 0
				41
				42	# Hack -- if someone passes in (size_t)-1, hoping to just
				43	# search until the end of the address space, we will overflow
				44	# below when we find the address of the last byte. Given
				45	# that we will never have a 56-bit address space, cropping
				46	# the length is the easiest way to avoid trouble.
				47	zap $18, 0x80, $5 # U : Bound length (clear the top byte of the count)
				48	beq $18, $not_found # U : zero length, nothing to search
				49	ldq_u $1, 0($16) # L : load first quadword Latency=3
				50	and $17, 0xff, $17 # E : L L U U : 00000000000000ch
				51
				52	insbl $17, 1, $2 # U : 000000000000ch00
				53	cmpult $18, 9, $4 # E : small (< 1 quad) string? $4 = 1 when $18 <= 8
				54	or $2, $17, $17 # E : 000000000000chch
				55	lda $3, -1($31) # E : U L L U : $3 = all ones, used to build byte masks
				56
				57	sll $17, 16, $2 # U : 00000000chch0000
				58	addq $16, $5, $5 # E : Max search address (one past the last byte)
				59	or $2, $17, $17 # E : 00000000chchchch
				60	sll $17, 32, $2 # U : U L L U : chchchch00000000
				61
				62	or $2, $17, $17 # E : chchchchchchchch
				63	extql $1, $16, $7 # U : $7 is upper bits
				64	beq $4, $first_quad # U : more than 8 bytes, take the long path
				65	ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
				66
					# Short path: merge the quadword holding the first byte ($7) with
					# the quadword holding the last byte ($6) into one unaligned
					# quadword, and use the unaligned start address as the base.
				67	extqh $6, $16, $6 # U : 2 cycle stall for $6
				68	mov $16, $0 # E : base address for $found_it
				69	nop # E :
				70	or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
				71
				72	# Deal with the case where at most 8 bytes remain to be searched
				73	# in $1. E.g.:
				74	# $18 = 6
				75	# $1 = ????c6c5c4c3c2c1
					# On entry $0 holds the address that byte 0 of $1 came from,
					# and $18 (1..8) is the number of valid bytes in $1.
				76	$last_quad:
				77	negq $18, $6 # E : shift count for srl (only its low 6 bits are used)
				78	xor $17, $1, $1 # E : matching bytes become 0
				79	srl $3, $6, $6 # U : $6 = mask of $18 bits set
				80	cmpbge $31, $1, $2 # E : L U L U : bit i of $2 set when byte i of $1 is 0
				81
				82	nop
				83	nop
				84	and $2, $6, $2 # E : drop matches that lie beyond the last byte
				85	beq $2, $not_found # U : U L U L
				86
					# Here $2 is a non-zero mask with one bit per matching byte of
					# the quadword whose base address is in $0. The result is $0
					# plus the index of the lowest set bit of $2.
				87	$found_it:
				88	#ifdef CONFIG_ALPHA_EV67
				89	/*
				90	* Since we are guaranteed to have set one of the bits, we don't
				91	* have to worry about coming back with a 0x40 out of cttz...
				92	*/
				93	cttz $2, $3 # U0 : $3 = index of the lowest set bit
				94	addq $0, $3, $0 # E : All done
				95	nop # E :
				96	ret # L0 : L U L U
				97	#else
				98	/*
				99	* Slow and clunky. It can probably be improved.
				100	* An exercise left for others.
				101	*/
					# Isolate the lowest set bit ($2 & -$2), then locate it by
					# binary search, adding 4, 2 and 1 to $0 as each test selects
					# the upper half.
				102	negq $2, $3 # E :
				103	and $2, $3, $2 # E : $2 = lowest set bit only
				104	and $2, 0x0f, $1 # E : zero when the bit is in positions 4..7
				105	addq $0, 4, $3 # E :
				106
				107	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
				108	nop # E : keep with cmov
				109	and $2, 0x33, $1 # E : zero when the bit is in positions 2,3,6,7
				110	addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
				111
				112	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
				113	nop # E : keep with cmov
				114	and $2, 0x55, $1 # E : zero when the bit is in an odd position
				115	addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
				116
				117	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
				118	nop
				119	nop
				120	ret # L0 : L U L U
				121	#endif
				122
				123	# Deal with the case where $18 > 8 bytes remain to be
				124	# searched. $16 may not be aligned.
					# $1 still holds the quadword loaded with ldq_u from 0($16).
					# Bytes below the start address are forced non-zero so that
					# they can never be reported as a match.
				125	.align 4
				126	$first_quad:
				127	andnot $16, 0x7, $0 # E : $0 = aligned address of the first quadword
				128	insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
				129	xor $1, $17, $1 # E : matching bytes become 0
				130	or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
				131
				132	cmpbge $31, $1, $2 # E :
				133	bne $2, $found_it # U :
				134	# At least one byte left to process.
				135	ldq $1, 8($0) # L : fetch the second quadword early
				136	subq $5, 1, $18 # E : U L U L : $18 = address of the last byte
				137
				138	addq $0, 8, $0 # E : $0 = address of the quadword now in $1
				139	# Make $18 point to last quad to be accessed (the
				140	# last quad may or may not be partial).
				141	andnot $18, 0x7, $18 # E :
				142	cmpult $0, $18, $2 # E : is $0 still below the last quad?
				143	beq $2, $final # U : U L U L : no, $1 already holds the last quad
				144
				145	# At least two quads remain to be accessed.
				146
				147	subq $18, $0, $4 # E : $4 <- byte distance from $0 to the last quad
				148	and $4, 8, $4 # E : odd number of quads?
				149	bne $4, $odd_quad_count # U :
				150	# At least three quads remain to be accessed
				151	mov $1, $4 # E : L U L U : move prefetched value to correct reg
				152
					# The loop is unrolled twice. The first half tests the quadword
					# in $4 while loading the next one into $1. The second half
					# tests $1 while loading the next one into $4. In both halves
					# $0 is the address of the quadword being tested at the point
					# where the branch to $found_it is taken.
				153	.align 4
				154	$unrolled_loop:
				155	ldq $1, 8($0) # L : prefetch $1
				156	xor $17, $4, $2 # E :
				157	cmpbge $31, $2, $2 # E :
				158	bne $2, $found_it # U : U L U L
				159
				160	addq $0, 8, $0 # E :
				161	nop # E :
				162	nop # E :
				163	nop # E :
				164
				165	$odd_quad_count:
				166	xor $17, $1, $2 # E :
				167	ldq $4, 8($0) # L : prefetch $4
				168	cmpbge $31, $2, $2 # E :
				169	addq $0, 8, $6 # E : $6 = address of the next quad, $0 is kept for $found_it
				170
				171	bne $2, $found_it # U :
				172	cmpult $6, $18, $6 # E : is the next quad still below the last quad?
				173	addq $0, 8, $0 # E :
				174	nop # E :
				175
				176	bne $6, $unrolled_loop # U :
				177	mov $4, $1 # E : move prefetched value into $1
				178	nop # E :
				179	nop # E :
				180
					# Here $0 is the address of the last quad and $1 holds its data.
				181	$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
				182	nop # E :
				183	nop # E :
				184	bne $18, $last_quad # U :
				185
					# No match: return 0.
				186	$not_found:
				187	mov $31, $0 # E :
				188	nop # E :
				189	nop # E :
				190	ret # L0 :
				191
				192	.end memchr
				193	EXPORT_SYMBOL(memchr)