/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 * Copyright (c) 2009  STMicroelectronics Ltd
 *   Optimised using prefetching and 64bit data transfer via FPU
 *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 */

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 */
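
/*
 * The copy runs backwards from the end of the buffers: the code keeps
 * r0 = dst + n and r5 = src - dst as invariants, so the indexed
 * address @(r0,r5) names the source byte corresponding to the
 * destination byte at r0.  Roughly, in C (illustrative only):
 *
 *        char *d = (char *) dst + n;
 *        ptrdiff_t delta = (char *) src - (char *) dst;
 *        while (d != (char *) dst) {
 *                d--;
 *                *d = d[delta];
 *        }
 */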

#include <sysdep.h>
#include <endian.h>

#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__)
#define MEMCPY_USES_FPU
/* Use paired single precision load or store mode for 64-bit transfers.
 * FPSCR.SZ=1 (with FPSCR.PR=0) is well defined on both SH4-200 and SH4-300.
 * Currently this is only implemented and tested for little endian mode. */
.macro FPU_SET_PAIRED_PREC
        sts     fpscr, r7
        mov     #0x10, r0       ! PR=0 SZ=1
        shll16  r0
        lds     r0, fpscr
.endm
.macro RESTORE_FPSCR
        lds     r7, fpscr
.endm
.macro DALLOC
        ! Cache allocate + store on dst-32.
        add     #-32, r1
        movca.l r0, @r1
        add     #32, r1
.endm

#endif
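
! Background notes: movca.l (used via DALLOC) allocates the destination
! cache line in the operand cache without first reading its old
! contents from memory; since the whole 32-byte line is about to be
! overwritten, this avoids the read-for-ownership traffic of a normal
! store miss.  FPU_SET_PAIRED_PREC sets FPSCR.SZ=1 (with PR=0), so a
! single fmov then transfers a 64-bit register pair.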

!
!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
!

! Size is 16 or greater, and may have trailing bytes
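!
! Rough C sketch of one loop iteration below (little-endian case,
! illustrative only): each aligned destination long word combines one
! byte carried over from the previous (higher-addressed) source word
! with three bytes of the newly loaded one:
!
!        uint32_t cur = *--src_word;
!        *--dst_word = (prev << 24) | (cur >> 8);
!        prev = cur;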

        .balign 32
.Lcase1:
        ! Read a long word and write a long word at once
        ! At the start of each iteration, r7 contains last long load
        add     #-1,r5          !  79 EX
        mov     r4,r2           !   5 MT (0 cycles latency)

        mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
        add     #-4,r5          !  50 EX

        add     #7,r2           !  79 EX
        !
#ifdef __LITTLE_ENDIAN__
        ! 6 cycles, 4 bytes per iteration
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
        mov     r7, r3          !   5 MT (latency=0)    ! RQPO

        cmp/hi  r2,r0           !  57 MT
        shll16  r3              ! 103 EX

        mov     r1,r6           !   5 MT (latency=0)
        shll8   r3              ! 102 EX                ! Oxxx

        shlr8   r6              ! 106 EX                ! xNML
        mov     r1, r7          !   5 MT (latency=0)

        or      r6,r3           !  82 EX                ! ONML
        bt/s    3b              ! 109 BR

        mov.l   r3,@-r0         !  30 LS
#else
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
        mov     r7,r3           !   5 MT (latency=0)    ! OPQR

        cmp/hi  r2,r0           !  57 MT
        shlr16  r3              ! 107 EX

        shlr8   r3              ! 106 EX                ! xxxO
        mov     r1,r6           !   5 MT (latency=0)

        shll8   r6              ! 102 EX                ! LMNx
        mov     r1,r7           !   5 MT (latency=0)

        or      r6,r3           !  82 EX                ! LMNO
        bt/s    3b              ! 109 BR

        mov.l   r3,@-r0         !  30 LS
#endif
        ! Finally, copy the remaining bytes one at a time, if necessary

        add     #4,r5           !  50 EX
        cmp/eq  r4,r0           !  54 MT

        add     #-6,r2          !  50 EX
        bt      9f              ! 109 BR

8:      cmp/hi  r2,r0           !  57 MT
        mov.b   @(r0,r5),r1     !  20 LS (latency=2)

        bt/s    8b              ! 109 BR

        mov.b   r1,@-r0         !  29 LS

9:      rts
        nop


!
!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
!

! Size is 16 or greater, and may have trailing bytes
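!
! Rough C sketch of one loop iteration below (little-endian case,
! illustrative only): here three bytes are carried over from the
! previous source word and one byte comes from the new load:
!
!        uint32_t cur = *--src_word;
!        *--dst_word = (prev << 8) | (cur >> 24);
!        prev = cur;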

        .balign 32
.Lcase3:
        ! Read a long word and write a long word at once
        ! At the start of each iteration, r7 contains last long load
        add     #-3,r5          !  79 EX
        mov     r4,r2           !   5 MT (0 cycles latency)

        mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
        add     #-4,r5          !  50 EX

        add     #7,r2           !  79 EX
        !
#ifdef __LITTLE_ENDIAN__
        ! 6 cycles, 4 bytes per iteration
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
        mov     r7, r3          !   5 MT (latency=0)    ! RQPO

        cmp/hi  r2,r0           !  57 MT
        shll8   r3              ! 102 EX                ! QPOx

        mov     r1,r6           !   5 MT (latency=0)
        shlr16  r6              ! 107 EX

        shlr8   r6              ! 106 EX                ! xxxN
        mov     r1, r7          !   5 MT (latency=0)

        or      r6,r3           !  82 EX                ! QPON
        bt/s    3b              ! 109 BR

        mov.l   r3,@-r0         !  30 LS
#else
3:      mov     r7,r3           ! OPQR
        shlr8   r3              ! xOPQ
        mov.l   @(r0,r5),r7     ! KLMN
        mov     r7,r6
        shll16  r6
        shll8   r6              ! Nxxx
        or      r6,r3           ! NOPQ
        cmp/hi  r2,r0
        bt/s    3b
        mov.l   r3,@-r0
#endif

        ! Finally, copy the remaining bytes one at a time, if necessary

        add     #6,r5           !  50 EX
        cmp/eq  r4,r0           !  54 MT

        add     #-6,r2          !  50 EX
        bt      9f              ! 109 BR

8:      cmp/hi  r2,r0           !  57 MT
        mov.b   @(r0,r5),r1     !  20 LS (latency=2)

        bt/s    8b              ! 109 BR

        mov.b   r1,@-r0         !  29 LS

9:      rts
        nop

ENTRY(memcpy)

        ! Calculate the invariants which will be used in the remainder
        ! of the code:
        !
        !       r4   -->  [ ... ] DST            [ ... ] SRC
        !                 [ ... ]                [ ... ]
        !                   :                      :
        !       r0   -->  [ ... ]    r0+r5 -->  [ ... ]
        !
        !

        ! Short circuit the common case of src, dst and len being 32 bit aligned
        ! and test for zero length move
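        ! (Sketch: below, r0 = dst | src | len; if its two low bits are
        ! clear, all three values are long word aligned and .Lcase00
        ! applies.)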

        mov     r6, r0          !   5 MT (0 cycle latency)
        or      r4, r0          !  82 EX

        or      r5, r0          !  82 EX
        tst     r6, r6          !  86 MT

        bt/s    99f             ! 111 BR            (zero len)
        tst     #3, r0          !  87 MT

        mov     r4, r0          !   5 MT (0 cycle latency)
        add     r6, r0          !  49 EX

        bt/s    .Lcase00        ! 111 BR            (aligned)
        sub     r4, r5          !  75 EX

        ! Arguments are not nicely long word aligned or zero len.
        ! Check for small copies, and if so do a simple byte at a time copy.
        !
        ! Deciding on an exact value of 'small' is not easy, as the point at which
        ! using the optimised routines becomes worthwhile varies (these are the
        ! cycle counts for different sizes using byte-at-a-time vs. optimised):
        !       size    byte-at-a-time  optimised (long / word / byte aligned)
        !        16          42            39-40    46-50    50-55
        !        24          58            43-44    54-58    62-67
        !        36          82            49-50    66-70    80-85
        ! However the penalty for getting it 'wrong' is much higher for long word
        ! aligned data (and this is more common), so use a value of 16.

        mov     #16, r1         !   6 EX
        cmp/gt  r6,r1           !  56 MT

        add     #-1,r5          !  50 EX
        bf/s    6f              ! 108 BR            (not small)

        mov     r5, r3          !   5 MT (latency=0)
        shlr    r6              ! 104 EX

        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
        bf/s    4f              ! 111 BR

        add     #-1,r3          !  50 EX
        tst     r6, r6          !  86 MT

        bt/s    98f             ! 110 BR
        mov.b   r1,@-r0         !  29 LS

        ! 4 cycles, 2 bytes per iteration
3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)

4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
        dt      r6              !  67 EX

        mov.b   r1,@-r0         !  29 LS
        bf/s    3b              ! 111 BR

        mov.b   r2,@-r0         !  29 LS
98:
        rts
        nop

99:     rts
        mov     r4, r0

        ! Size is not small, so it's worthwhile looking for optimisations.
        ! First align destination to a long word boundary.
        !
        ! r5 = normal value -1

6:      tst     #3, r0          !  87 MT
        mov     #3, r3          !   6 EX

        bt/s    2f              ! 111 BR
        and     r0,r3           !  78 EX

        ! 3 cycles, 1 byte per iteration
1:      dt      r3              !  67 EX
        mov.b   @(r0,r5),r1     !  19 LS (latency=2)

        add     #-1, r6         !  79 EX
        bf/s    1b              ! 109 BR

        mov.b   r1,@-r0         !  28 LS

2:      add     #1, r5          !  79 EX

        ! Now select the appropriate bulk transfer code based on relative
        ! alignment of src and dst.
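        !
        ! Dispatch sketch (illustrative): with d = (src - dst) & 3 held
        ! in r0,
        !   d == 0: .Lcase0 (< 64 bytes) or .Lcase0b (>= 64 bytes)
        !   d == 2: .Lcase2 (< 64 bytes) or .Lcase2b (>= 64 bytes)
        !   d == 1: .Lcase1 and d == 3: .Lcase3 (shift-and-or copies)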

        mov     r0, r3          !   5 MT (latency=0)

        mov     r5, r0          !   5 MT (latency=0)
        tst     #1, r0          !  87 MT

        bf/s    1f              ! 111 BR
        mov     #64, r7         !   6 EX

        ! bit 0 clear

        cmp/ge  r7, r6          !  55 MT

        bt/s    2f              ! 111 BR
        tst     #2, r0          !  87 MT

        ! small
        bt/s    .Lcase0
        mov     r3, r0

        bra     .Lcase2
        nop

        ! big
2:      bt/s    .Lcase0b
        mov     r3, r0

        bra     .Lcase2b
        nop

        ! bit 0 set
1:      tst     #2, r0          !  87 MT

        bt/s    .Lcase1
        mov     r3, r0

        bra     .Lcase3
        nop


!
!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
!

! src, dst and size are all long word aligned
! size is non-zero
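!
! Roughly, in C (illustrative only): the loop below moves two long
! words per iteration, backwards, with uint32_t pointers d and s:
!
!        size_t n = len / 8;
!        /* one long word is stored first when (len / 4) is odd */
!        while (n--) { *--d = *--s; *--d = *--s; }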

        .balign 32
.Lcase00:
        mov     #64, r1         !   6 EX
        mov     r5, r3          !   5 MT (latency=0)

        cmp/gt  r6, r1          !  56 MT
        add     #-4, r5         !  50 EX

        bf      .Lcase00b       ! 108 BR            (big loop)
        shlr2   r6              ! 105 EX

        shlr    r6              ! 104 EX
        mov.l   @(r0, r5), r1   !  21 LS (latency=2)

        bf/s    4f              ! 111 BR
        add     #-8, r3         !  50 EX

        tst     r6, r6          !  86 MT
        bt/s    5f              ! 110 BR

        mov.l   r1,@-r0         !  30 LS

        ! 4 cycles, 2 long words per iteration
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)

4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
        dt      r6              !  67 EX

        mov.l   r1, @-r0        !  30 LS
        bf/s    3b              ! 109 BR

        mov.l   r2, @-r0        !  30 LS

5:      rts
        nop


! Size is 16 or greater and less than 64, but may have trailing bytes

        .balign 32
.Lcase0:
        add     #-4, r5         !  50 EX
        mov     r4, r7          !   5 MT (latency=0)

        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
        mov     #4, r2          !   6 EX

        add     #11, r7         !  50 EX
        tst     r2, r6          !  86 MT

        mov     r5, r3          !   5 MT (latency=0)
        bt/s    4f              ! 111 BR

        add     #-4, r3         !  50 EX
        mov.l   r1,@-r0         !  30 LS

        ! 4 cycles, 2 long words per iteration
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)

4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
        cmp/hi  r7, r0

        mov.l   r1, @-r0        !  30 LS
        bt/s    3b              ! 109 BR

        mov.l   r2, @-r0        !  30 LS

        ! Copy the final 0-3 bytes

        add     #3,r5           !  50 EX

        cmp/eq  r0, r4          !  54 MT
        add     #-10, r7        !  50 EX

        bt      9f              ! 110 BR

        ! 3 cycles, 1 byte per iteration
1:      mov.b   @(r0,r5),r1     !  19 LS
        cmp/hi  r7,r0           !  57 MT

        bt/s    1b              ! 111 BR
        mov.b   r1,@-r0         !  28 LS

9:      rts
        nop

! Size is at least 64 bytes, so will be going round the big loop at least once.
!
!   r2 = rounded up r4
!   r3 = rounded down r0
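!   (that is, r3 = r0 & ~0x1f, the cache line boundary at or below the
!   store pointer, and r2 = (r4 + 0x1f) & ~0x1f, the boundary at or
!   above dst; the 32-byte main loop runs between the two)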

        .balign 32
.Lcase0b:
        add     #-4, r5         !  50 EX

.Lcase00b:
        mov     r0, r3          !   5 MT (latency=0)
        mov     #(~0x1f), r1    !   6 EX

        and     r1, r3          !  78 EX
        mov     r4, r2          !   5 MT (latency=0)

        cmp/eq  r3, r0          !  54 MT
        add     #0x1f, r2       !  50 EX

        bt/s    1f              ! 110 BR
        and     r1, r2          !  78 EX

        ! copy initial words until cache line aligned

        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
        tst     #4, r0          !  87 MT

        mov     r5, r6          !   5 MT (latency=0)
        add     #-4, r6         !  50 EX

        bt/s    4f              ! 111 BR
        add     #8, r3          !  50 EX

        tst     #0x18, r0       !  87 MT

        bt/s    1f              ! 109 BR
        mov.l   r1,@-r0         !  30 LS

        ! 4 cycles, 2 long words per iteration
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)

4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
        cmp/eq  r3, r0          !  54 MT

        mov.l   r1, @-r0        !  30 LS
        bf/s    3b              ! 109 BR

        mov.l   r7, @-r0        !  30 LS

#ifdef MEMCPY_USES_FPU
        ! Copy the cache line aligned blocks by using the FPU registers.
        ! If src and dst are well aligned adopt 64-bit data transfer.
        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
        !   r5: src (was r0+r5)
        !   r1: dest (was r0)
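        !
        ! Each iteration of the big loop below moves one 128-byte block:
        ! sixteen 64-bit fmov loads fill dr0-dr14 and the bank-1 pairs
        ! xd0-xd14, and the stores write back four 32-byte cache lines,
        ! each opened with movca.l via DALLOC.  Two cache lines are
        ! prefetched at the top of each iteration and two more at the
        ! bottom, keeping the prefetch pointer r6 a constant 0x80 bytes
        ! ahead of the loads in the downward copy direction.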
1:
        add     r0, r5
        mov     r0, r1

        mov     r1, r3          ! MT
        sub     r2, r3          ! EX (r3 - r2 -> r3)
        mov     #-5, r0
        shld    r0, r3          ! number of cache lines

        mov     #8, r0
        cmp/ge  r0, r3          ! Check if there are many cache lines to copy.
        bf      45f             ! Copy cache line aligned blocks without pref.
        mov     r5, r0
        add     #-0x7c, r0
        tst     #7, r0          ! is src 8-byte aligned?
        bf      45f

        ! Many cache lines have to be copied and the buffers are well aligned.
        ! Aggressive prefetching and FPU in single paired precision.
        mov     r0, r5
        mov     r5, r6
        add     #-0x80, r6      ! prefetch head

        ! store FPU (in single precision mode, do not check R15 align).
        fmov    fr12, @-r15
        fmov    fr13, @-r15
        fmov    fr14, @-r15
        fmov    fr15, @-r15

        FPU_SET_PAIRED_PREC

        mov     #4, r0
67:
        add     #-0x20, r6
        pref    @r6
        add     #-0x20, r6
        pref    @r6

        fmov    @r5+, dr0
        fmov    @r5+, dr2
        fmov    @r5+, dr4
        fmov    @r5+, dr6
        fmov    @r5+, dr8
        fmov    @r5+, dr10
        fmov    @r5+, dr12
        fmov    @r5+, dr14
        fmov    @r5+, xd0
        fmov    @r5+, xd2
        fmov    @r5+, xd4
        fmov    @r5+, xd6
        fmov    @r5+, xd8
        fmov    @r5+, xd10
        fmov    @r5+, xd12
        fmov    @r5+, xd14

        DALLOC
        fmov    xd14, @-r1
        fmov    xd12, @-r1
        fmov    xd10, @-r1
        fmov    xd8, @-r1
        DALLOC
        fmov    xd6, @-r1
        fmov    xd4, @-r1
        fmov    xd2, @-r1
        fmov    xd0, @-r1
        DALLOC
        fmov    dr14, @-r1
        fmov    dr12, @-r1
        fmov    dr10, @-r1
        fmov    dr8, @-r1
        DALLOC
        fmov    dr6, @-r1
        add     #-0x80, r5
        fmov    dr4, @-r1
        add     #-0x80, r5
        fmov    dr2, @-r1
        add     #-0x20, r6
        fmov    dr0, @-r1
        add     #-4, r3
        pref    @r6
        add     #-0x20, r6
        cmp/ge  r0, r3
        bt/s    67b
        pref    @r6

        RESTORE_FPSCR

        ! Restore FPU callee save registers
        fmov    @r15+, fr15
        fmov    @r15+, fr14
        fmov    @r15+, fr13
        fmov    @r15+, fr12

        ! There may still be cache lines left to copy: use the FPU in single
        ! paired precision without prefetching.  No alignment check is needed.

        mov     #1, r0
        cmp/ge  r0, r3
        bt/s    3f
        add     #0x60, r5

        bra     5f
        nop

        ! No prefetch and FPU in single precision.
45:
        add     #-0x1c, r5
        mov     r5, r0
        tst     #7, r0
        bt      3f

2:      fmov.s  @r5+, fr0
        fmov.s  @r5+, fr1
        fmov.s  @r5+, fr2
        fmov.s  @r5+, fr3
        fmov.s  @r5+, fr4
        fmov.s  @r5+, fr5
        fmov.s  @r5+, fr6
        fmov.s  @r5+, fr7

        DALLOC

        fmov.s  fr7, @-r1
        fmov.s  fr6, @-r1
        fmov.s  fr5, @-r1
        fmov.s  fr4, @-r1
        fmov.s  fr3, @-r1
        fmov.s  fr2, @-r1
        fmov.s  fr1, @-r1
        fmov.s  fr0, @-r1

        cmp/eq  r2,r1

        bf/s    2b
        add     #-0x40, r5

        bra     5f
        nop

        ! No prefetch and FPU in single paired precision.

3:      FPU_SET_PAIRED_PREC

4:      fmov    @r5+, dr0
        fmov    @r5+, dr2
        fmov    @r5+, dr4
        fmov    @r5+, dr6

        DALLOC

        fmov    dr6, @-r1
        fmov    dr4, @-r1
        fmov    dr2, @-r1
        fmov    dr0, @-r1
        cmp/eq  r2,r1

        bf/s    4b
        add     #-0x40, r5

        RESTORE_FPSCR

5:      mov     r1, r0

        cmp/eq  r4, r0          !  54 MT
        bf/s    1f              ! 109 BR
        sub     r1, r5          !  75 EX

        rts
        nop
1:
#else
        ! Copy the cache line aligned blocks
        !
        ! In use: r0, r2, r4, r5
        ! Scratch: r1, r3, r6, r7
        !
        ! We could do this with the four scratch registers, but if src
        ! and dest hit the same cache line, this will thrash, so make
        ! use of additional registers.
        !
        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
        !   r5: src (was r0+r5)
        !   r1: dest (was r0)
        ! this can be reversed at the end, so we don't need to save any extra
        ! state.
        !
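        ! Rough C shape of the loop below (illustrative only), one
        ! 32-byte cache line per iteration, with movca.l allocating the
        ! destination line instead of fetching it:
        !
        !        do {
        !                d -= 8; s -= 8;        /* uint32_t pointers */
        !                d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3];
        !                d[4]=s[4]; d[5]=s[5]; d[6]=s[6]; d[7]=s[7];
        !        } while (d != round_up(dst, 32));
        !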
1:      mov.l   r8, @-r15       !  30 LS
        add     r0, r5          !  49 EX

        mov.l   r9, @-r15       !  30 LS
        mov     r0, r1          !   5 MT (latency=0)

        mov.l   r10, @-r15      !  30 LS
        add     #-0x1c, r5      !  50 EX

        mov.l   r11, @-r15      !  30 LS

        ! 16 cycles, 32 bytes per iteration
2:      mov.l   @(0x00,r5),r0   !  18 LS (latency=2)
        add     #-0x20, r1      !  50 EX
        mov.l   @(0x04,r5),r3   !  18 LS (latency=2)
        mov.l   @(0x08,r5),r6   !  18 LS (latency=2)
        mov.l   @(0x0c,r5),r7   !  18 LS (latency=2)
        mov.l   @(0x10,r5),r8   !  18 LS (latency=2)
        mov.l   @(0x14,r5),r9   !  18 LS (latency=2)
        mov.l   @(0x18,r5),r10  !  18 LS (latency=2)
        mov.l   @(0x1c,r5),r11  !  18 LS (latency=2)
        movca.l r0,@r1          !  40 LS (latency=3-7)
        mov.l   r3,@(0x04,r1)   !  33 LS
        mov.l   r6,@(0x08,r1)   !  33 LS
        mov.l   r7,@(0x0c,r1)   !  33 LS

        mov.l   r8,@(0x10,r1)   !  33 LS
        add     #-0x20, r5      !  50 EX

        mov.l   r9,@(0x14,r1)   !  33 LS
        cmp/eq  r2,r1           !  54 MT

        mov.l   r10,@(0x18,r1)  !  33 LS
        bf/s    2b              ! 109 BR

        mov.l   r11,@(0x1c,r1)  !  33 LS

        mov     r1, r0          !   5 MT (latency=0)

        mov.l   @r15+, r11      !  15 LS
        sub     r1, r5          !  75 EX

        mov.l   @r15+, r10      !  15 LS
        cmp/eq  r4, r0          !  54 MT

        bf/s    1f              ! 109 BR
        mov.l   @r15+, r9       !  15 LS

        rts
1:      mov.l   @r15+, r8       !  15 LS
#endif
        sub     r4, r1          !  75 EX            (len remaining)

        ! number of trailing bytes is non-zero
        !
        ! invariants restored (r5 already decremented by 4)
        ! also r1=num bytes remaining

        mov     #4, r2          !   6 EX
        mov     r4, r7          !   5 MT (latency=0)

        add     #0x1c, r5       !  50 EX            (back to -4)
        cmp/hs  r2, r1          !  58 MT

        bf/s    5f              ! 108 BR
        add     #11, r7         !  50 EX

        mov.l   @(r0, r5), r6   !  21 LS (latency=2)
        tst     r2, r1          !  86 MT

        mov     r5, r3          !   5 MT (latency=0)
        bt/s    4f              ! 111 BR

        add     #-4, r3         !  50 EX
        cmp/hs  r2, r1          !  58 MT

        bt/s    5f              ! 111 BR
        mov.l   r6,@-r0         !  30 LS

        ! 4 cycles, 2 long words per iteration
3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)

4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
        cmp/hi  r7, r0

        mov.l   r6, @-r0        !  30 LS
        bt/s    3b              ! 109 BR

        mov.l   r2, @-r0        !  30 LS

        ! Copy the final 0-3 bytes

5:      cmp/eq  r0, r4          !  54 MT
        add     #-10, r7        !  50 EX

        bt      9f              ! 110 BR
        add     #3,r5           !  50 EX

        ! 3 cycles, 1 byte per iteration
1:      mov.b   @(r0,r5),r1     !  19 LS
        cmp/hi  r7,r0           !  57 MT

        bt/s    1b              ! 111 BR
        mov.b   r1,@-r0         !  28 LS

9:      rts
        nop

!
!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
!

        .balign 32
.Lcase2:
        ! Size is 16 or greater and less than 64, but may have trailing bytes

2:      mov     r5, r6          !   5 MT (latency=0)
        add     #-2,r5          !  50 EX

        mov     r4,r2           !   5 MT (latency=0)
        add     #-4,r6          !  50 EX

        add     #7,r2           !  50 EX
3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)

        mov.w   @(r0,r6),r3     !  20 LS (latency=2)
        cmp/hi  r2,r0           !  57 MT

        mov.w   r1,@-r0         !  29 LS
        bt/s    3b              ! 111 BR

        mov.w   r3,@-r0         !  29 LS

        bra     10f
        nop


        .balign 32
.Lcase2b:
        ! Size is at least 64 bytes, so will be going round the big loop at least once.
        !
        !   r2 = rounded up r4
        !   r3 = rounded down r0

        mov     r0, r3          !   5 MT (latency=0)
        mov     #(~0x1f), r1    !   6 EX

        and     r1, r3          !  78 EX
        mov     r4, r2          !   5 MT (latency=0)

        cmp/eq  r3, r0          !  54 MT
        add     #0x1f, r2       !  50 EX

        add     #-2, r5         !  50 EX
        bt/s    1f              ! 110 BR
        and     r1, r2          !  78 EX

        ! Copy one short word at a time until we are cache line aligned
        ! Normal values: r0, r2, r3, r4
        ! Unused: r1, r6, r7
        ! Mod: r5 (=r5-2)
        !
        add     #2, r3          !  50 EX

2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
        cmp/eq  r3,r0           !  54 MT

        bf/s    2b              ! 111 BR

        mov.w   r1,@-r0         !  29 LS

        ! Copy the cache line aligned blocks
        !
        ! In use: r0, r2, r4, r5 (=r5-2)
        ! Scratch: r1, r3, r6, r7
        !
        ! We could do this with the four scratch registers, but if src
        ! and dest hit the same cache line, this will thrash, so make
        ! use of additional registers.
        !
        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
        !   r5: src (was r0+r5)
        !   r1: dest (was r0)
        ! this can be reversed at the end, so we don't need to save any extra
        ! state.
        !
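        ! xtrct Rm,Rn yields the middle 32 bits of the 64-bit pair
        ! {Rm:Rn}, i.e. Rn = (Rm << 16) | (Rn >> 16).  Rough C for one
        ! long word of the loop below (little-endian case, illustrative
        ! only, lo/hi words by address):
        !
        !        out = (lo_word >> 16) | (hi_word << 16);
        !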
1:      mov.l   r8, @-r15       !  30 LS
        add     r0, r5          !  49 EX

        mov.l   r9, @-r15       !  30 LS
        mov     r0, r1          !   5 MT (latency=0)

        mov.l   r10, @-r15      !  30 LS
        add     #-0x1e, r5      !  50 EX

        mov.l   r11, @-r15      !  30 LS

        mov.l   r12, @-r15      !  30 LS

        ! 17 cycles, 32 bytes per iteration
#ifdef __LITTLE_ENDIAN__
2:      mov.w   @r5+, r0        !  14 LS (latency=2)    ..JI
        add     #-0x20, r1      !  50 EX

        mov.l   @r5+, r3        !  15 LS (latency=2)    NMLK

        mov.l   @r5+, r6        !  15 LS (latency=2)    RQPO
        shll16  r0              ! 103 EX                JI..

        mov.l   @r5+, r7        !  15 LS (latency=2)
        xtrct   r3, r0          !  48 EX                LKJI

        mov.l   @r5+, r8        !  15 LS (latency=2)
        xtrct   r6, r3          !  48 EX                PONM

        mov.l   @r5+, r9        !  15 LS (latency=2)
        xtrct   r7, r6          !  48 EX

        mov.l   @r5+, r10       !  15 LS (latency=2)
        xtrct   r8, r7          !  48 EX

        mov.l   @r5+, r11       !  15 LS (latency=2)
        xtrct   r9, r8          !  48 EX

        mov.w   @r5+, r12       !  15 LS (latency=2)
        xtrct   r10, r9         !  48 EX

        movca.l r0,@r1          !  40 LS (latency=3-7)
        xtrct   r11, r10        !  48 EX

        mov.l   r3, @(0x04,r1)  !  33 LS
        xtrct   r12, r11        !  48 EX

        mov.l   r6, @(0x08,r1)  !  33 LS

        mov.l   r7, @(0x0c,r1)  !  33 LS

        mov.l   r8, @(0x10,r1)  !  33 LS
        add     #-0x40, r5      !  50 EX

        mov.l   r9, @(0x14,r1)  !  33 LS
        cmp/eq  r2,r1           !  54 MT

        mov.l   r10, @(0x18,r1) !  33 LS
        bf/s    2b              ! 109 BR

        mov.l   r11, @(0x1c,r1) !  33 LS
#else
2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
        add     #-2, r5         !  50 EX

        mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
        add     #-4, r1         !  50 EX

        mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
        shll16  r0              ! 103 EX

        mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
        xtrct   r3, r0          !  48 EX

        mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
        xtrct   r6, r3          !  48 EX

        mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
        xtrct   r7, r6          !  48 EX

        mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
        xtrct   r8, r7          !  48 EX

        mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
        xtrct   r9, r8          !  48 EX

        mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
        xtrct   r10, r9         !  48 EX

        movca.l r0,@r1          !  40 LS (latency=3-7)
        add     #-0x1c, r1      !  50 EX

        mov.l   r3, @(0x18,r1)  !  33 LS
        xtrct   r11, r10        !  48 EX

        mov.l   r6, @(0x14,r1)  !  33 LS
        xtrct   r12, r11        !  48 EX

        mov.l   r7, @(0x10,r1)  !  33 LS

        mov.l   r8, @(0x0c,r1)  !  33 LS
        add     #-0x1e, r5      !  50 EX

        mov.l   r9, @(0x08,r1)  !  33 LS
        cmp/eq  r2,r1           !  54 MT

        mov.l   r10, @(0x04,r1) !  33 LS
        bf/s    2b              ! 109 BR

        mov.l   r11, @(0x00,r1) !  33 LS
#endif

        mov.l   @r15+, r12
        mov     r1, r0          !   5 MT (latency=0)

        mov.l   @r15+, r11      !  15 LS
        sub     r1, r5          !  75 EX

        mov.l   @r15+, r10      !  15 LS
        cmp/eq  r4, r0          !  54 MT

        bf/s    1f              ! 109 BR
        mov.l   @r15+, r9       !  15 LS

        rts
1:      mov.l   @r15+, r8       !  15 LS

        add     #0x1e, r5       !  50 EX

        ! Finish off a short word at a time
        ! r5 must be invariant - 2
10:     mov     r4,r2           !   5 MT (latency=0)
        add     #1,r2           !  50 EX

        cmp/hi  r2, r0          !  57 MT
        bf/s    1f              ! 109 BR

        add     #2, r2          !  50 EX

3:      mov.w   @(r0,r5),r1     !  20 LS
        cmp/hi  r2,r0           !  57 MT

        bt/s    3b              ! 109 BR

        mov.w   r1,@-r0         !  29 LS
1:

        !
        ! Finally, copy the last byte if necessary
        cmp/eq  r4,r0           !  54 MT
        bt/s    9b
        add     #1,r5
        mov.b   @(r0,r5),r1
        rts
        mov.b   r1,@-r0

END(memcpy)
libc_hidden_def (memcpy)