Blame - src/kernel/linux/v4.19/arch/alpha/lib/ev6-memcpy.S - T800

blob: ffbd056b6eb2905d72d01b6f5bc65d9ac4a06340 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0 */
				2	/*
				3	* arch/alpha/lib/ev6-memcpy.S
				4	* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
				5	*
				6	* Reasonably optimized memcpy() routine for the Alpha 21264
				7	*
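				8	* - bulk of the copy uses aligned quadword loads/stores; unaligned head and tail bytes are moved with ldbu/stb
				9	* - the unrolled 64-byte loop issues wh64 write hints; sources misaligned relative to dest are merged with ldq_u/extql/extqh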
				10	*
				11	* Much of the information about 21264 scheduling/coding comes from:
				12	* Compiler Writer's Guide for the Alpha 21264
				13	* abbreviated as 'CWG' in other comments here
				14	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				15	* Scheduling notation:
				16	* E - either cluster
				17	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				18	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				19	*
				20	* Temp usage notes:
				21	* $1 - $7 - scratch ($16, $17 and $18 are modified as the copy proceeds)
				22	*/
				23	#include <asm/export.h>
				24	.set noreorder
				25	.set noat
				26
					/*
					* memcpy - copy $18 bytes from the buffer at $17 to the buffer at $16.
					*
					* In:   $16 = dest, $17 = src, $18 = byte count (tested as a signed
					*       value: a count <= 0 copies nothing)
					* Out:  $0 = the dest pointer exactly as it was passed in
					* Uses: $1 - $7 as temporaries; $16, $17 and $18 are modified
					*
					* Two strategies, selected from the low three bits of src ^ dest:
					* - same alignment mod 8: byte copy up to a 0mod8 boundary, then
					*   aligned quadword copies (unrolled to 64 bytes per trip, with
					*   wh64 hints, while more than 127 bytes remain), then tail bytes.
					* - different alignment mod 8 ($misaligned): byte copy until dest is
					*   0mod8, then every aligned 8-byte store is assembled from two
					*   ldq_u loads with extql/extqh, then tail bytes.
					*/
				27	.align 4
				28	.globl memcpy
				29	.ent memcpy
				30	memcpy:
				31	.frame $30,0,$26,0
				32	.prologue 0
				33
				34	mov $16, $0 # E : copy dest to return
				35	ble $18, $nomoredata # U : done with the copy?
				36	xor $16, $17, $1 # E : are source and dest alignments the same?
				37	and $1, 7, $1 # E : are they the same mod 8?
				38
				39	bne $1, $misaligned # U : Nope - gotta do this the slow way
				40	/* source and dest are same mod 8 address */
				41	and $16, 7, $1 # E : Are both 0mod8?
				42	beq $1, $both_0mod8 # U : Yes
				43	nop # E :
				44
				45	/*
				46	* source and dest are same misalignment. move a byte at a time
				47	* until a 0mod8 alignment for both is reached.
				48	* At least one byte more to move
				49	*/
				50
				51	$head_align:
				52	ldbu $1, 0($17) # L : grab a byte
				53	subq $18, 1, $18 # E : count--
				54	addq $17, 1, $17 # E : src++
				55	stb $1, 0($16) # L : store it
				56	addq $16, 1, $16 # E : dest++
				57	and $16, 7, $1 # E : Are we at 0mod8 yet?
				58	ble $18, $nomoredata # U : done with the copy?
				59	bne $1, $head_align # U : Nope - next byte
				60
					/* Both pointers are 0mod8 here and the count in $18 is still > 0 */
				61	$both_0mod8:
				62	cmple $18, 127, $1 # E : Can we unroll the loop?
				63	bne $1, $no_unroll # U : Nope - 127 bytes or fewer left
				64	and $16, 63, $1 # E : get mod64 alignment
				65	beq $1, $do_unroll # U : no single quads to fiddle
				66
					/* Copy one quad at a time until dest is 0mod64 (at most 7 quads) */
				67	$single_head_quad:
				68	ldq $1, 0($17) # L : get 8 bytes
				69	subq $18, 8, $18 # E : count -= 8
				70	addq $17, 8, $17 # E : src += 8
				71	nop # E :
				72
				73	stq $1, 0($16) # L : store
				74	addq $16, 8, $16 # E : dest += 8
				75	and $16, 63, $1 # E : get mod64 alignment
				76	bne $1, $single_head_quad # U : still not fully aligned
				77
					/*
					* dest is 0mod64 here. The count is tested again because the head
					* quads above may have consumed up to 56 bytes.
					*/
				78	$do_unroll:
				79	addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
				80	cmple $18, 127, $1 # E : Can we go through the unrolled loop?
				81	bne $1, $tail_quads # U : Nope
				82	nop # E :
				83
					/*
					* Each trip copies 64 bytes as two 32-byte halves.
					* $7 is the wh64 target: it starts one 64-byte block past dest and
					* advances by 64 per trip. When fewer than 192 bytes remained at the
					* start of a trip ($2 < 0), cmovlt replaces it with $1, which is that
					* trip's starting dest + 64, i.e. the block the next trip itself writes.
					*/
				84	$unroll_body:
				85	wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
				86	# ($7) are about to be over-written
				87	ldq $6, 0($17) # L0 : bytes 0..7
				88	nop # E :
				89	nop # E :
				90
				91	ldq $4, 8($17) # L : bytes 8..15
				92	ldq $5, 16($17) # L : bytes 16..23
				93	addq $7, 64, $7 # E : Update next wh64 address
				94	nop # E :
				95
				96	ldq $3, 24($17) # L : bytes 24..31
				97	addq $16, 64, $1 # E : fallback value for wh64
				98	nop # E :
				99	nop # E :
				100
				101	addq $17, 32, $17 # E : src += 32 bytes
				102	stq $6, 0($16) # L : bytes 0..7
				103	nop # E :
				104	nop # E :
				105
				106	stq $4, 8($16) # L : bytes 8..15
				107	stq $5, 16($16) # L : bytes 16..23
				108	subq $18, 192, $2 # E : At least two more trips to go?
				109	nop # E :
				110
				111	stq $3, 24($16) # L : bytes 24..31
				112	addq $16, 32, $16 # E : dest += 32 bytes
				113	nop # E :
				114	nop # E :
				115
				116	ldq $6, 0($17) # L : bytes 0..7
				117	ldq $4, 8($17) # L : bytes 8..15
				118	cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
				119	# fallback wh64 address if < 2 more trips
				120	nop # E :
				121
				122	ldq $5, 16($17) # L : bytes 16..23
				123	ldq $3, 24($17) # L : bytes 24..31
				124	addq $16, 32, $16 # E : dest += 32
				125	subq $18, 64, $18 # E : count -= 64
				126
				127	addq $17, 32, $17 # E : src += 32
				128	stq $6, -32($16) # L : bytes 0..7
				129	stq $4, -24($16) # L : bytes 8..15
				130	cmple $18, 63, $1 # E : At least one more trip?
				131
				132	stq $5, -16($16) # L : bytes 16..23
				133	stq $3, -8($16) # L : bytes 24..31
				134	nop # E :
				135	beq $1, $unroll_body # U : Yes - 64 or more bytes left
				136
				137	$tail_quads:
				138	$no_unroll:
				139	.align 4
				140	subq $18, 8, $18 # E : At least a quad left?
				141	blt $18, $less_than_8 # U : Nope
				142	nop # E :
				143	nop # E :
				144
					/* Inside this loop $18 holds (bytes remaining - 8) */
				145	$move_a_quad:
				146	ldq $1, 0($17) # L : fetch 8
				147	subq $18, 8, $18 # E : count -= 8
				148	addq $17, 8, $17 # E : src += 8
				149	nop # E :
				150
				151	stq $1, 0($16) # L : store 8
				152	addq $16, 8, $16 # E : dest += 8
				153	bge $18, $move_a_quad # U : at least a quad still left?
				154	nop # E :
				155
				156	$less_than_8:
				157	.align 4
				158	addq $18, 8, $18 # E : add back for trailing bytes
				159	ble $18, $nomoredata # U : All-done
				160	nop # E :
				161	nop # E :
				162
				163	/* Trailing bytes */
				164	$tail_bytes:
				165	subq $18, 1, $18 # E : count--
				166	ldbu $1, 0($17) # L : fetch a byte
				167	addq $17, 1, $17 # E : src++
				168	nop # E :
				169
				170	stb $1, 0($16) # L : store a byte
				171	addq $16, 1, $16 # E : dest++
				172	bgt $18, $tail_bytes # U : more to be done?
				173	nop # E :
				174
				175	/* branching to exit takes 3 extra cycles, so replicate exit here */
				176	ret $31, ($26), 1 # L0 :
				177	nop # E :
				178	nop # E :
				179	nop # E :
				180
					/*
					* src and dest differ mod 8. On this path the running dest pointer
					* lives in $4; $16 is reused further down as a load temporary.
					*/
				181	$misaligned:
				182	mov $0, $4 # E : dest temp
				183	and $0, 7, $1 # E : dest alignment mod8
				184	beq $1, $dest_0mod8 # U : dest is already 0mod8
				185	nop
				186
				187	$aligndest:
				188	ble $18, $nomoredata # U : out of bytes before dest aligned?
				189	ldbu $1, 0($17) # L : fetch a byte
				190	subq $18, 1, $18 # E : count--
				191	addq $17, 1, $17 # E : src++
				192
				193	stb $1, 0($4) # L : store it
				194	addq $4, 1, $4 # E : dest++
				195	and $4, 7, $1 # E : dest 0mod8 yet?
				196	bne $1, $aligndest # U : go until we are aligned.
				197
				198	/* Source has unknown alignment, but dest is known to be 0mod8 */
				199	$dest_0mod8:
				200	subq $18, 8, $18 # E : At least a quad left?
				201	blt $18, $misalign_tail # U : Nope
				202	ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
				203	nop # E :
				204
					/*
					* $3 carries the quad loaded on the previous trip (or the seed),
					* $16 receives the following one. The two extracted pieces are
					* ORed into $1 and stored to the aligned dest in $4.
					* NOTE(review): the byte shift is presumably taken from the low
					* bits of $17 by extql/extqh - confirm against the Alpha manual.
					*/
				205	$mis_quad:
				206	ldq_u $16, 8($17) # L : Fetch next 8
				207	extql $3, $17, $3 # U : masking - piece from current quad
				208	extqh $16, $17, $1 # U : masking - piece from next quad
				209	bis $3, $1, $1 # E : merged bytes to store
				210
				211	subq $18, 8, $18 # E : count -= 8
				212	addq $17, 8, $17 # E : src += 8
				213	stq $1, 0($4) # L : store 8 (aligned)
				214	mov $16, $3 # E : "rotate" source data
				215
				216	addq $4, 8, $4 # E : dest += 8
				217	bge $18, $mis_quad # U : More quads to move
				218	nop
				219	nop
				220
				221	$misalign_tail:
				222	addq $18, 8, $18 # E : account for tail stuff
				223	ble $18, $nomoredata # U : no tail bytes
				224	nop
				225	nop
				226
				227	$misalign_byte:
				228	ldbu $1, 0($17) # L : fetch 1
				229	subq $18, 1, $18 # E : count--
				230	addq $17, 1, $17 # E : src++
				231	nop # E :
				232
				233	stb $1, 0($4) # L : store
				234	addq $4, 1, $4 # E : dest++
				235	bgt $18, $misalign_byte # U : more to go?
				236	nop
				237
				238
				239	$nomoredata:
				240	ret $31, ($26), 1 # L0 :
				241	nop # E :
				242	nop # E :
				243	nop # E :
				244
				245	.end memcpy
				246	EXPORT_SYMBOL(memcpy)
				247
				248	/* Define __memcpy as a global alias of memcpy, for backwards module compatibility. */
				249	__memcpy = memcpy
				250	.globl __memcpy