/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 * Copyright (c) 2009  STMicroelectronics Ltd
 *   Optimised using prefetching and 64-bit data transfer via FPU
 *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 */

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 */

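/*
 * For reference, the contract above corresponds to this minimal C sketch
 * (illustrative only; note that, like the assembly below, it copies
 * backwards, from the end of the buffers towards the start):
 *
 *	void *memcpy(void *dst, const void *src, size_t n)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *		while (n--)
 *			d[n] = s[n];
 *		return dst;
 *	}
 */
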
#include <sysdep.h>
#include <endian.h>

#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__)
#define MEMCPY_USES_FPU
/* Use paired single precision load/store mode for 64-bit transfers.
 * FPSCR.SZ=1, FPSCR.PR=0 is well defined on both SH4-200 and SH4-300.
 * Currently it has only been implemented and tested for little endian mode. */
.macro FPU_SET_PAIRED_PREC
	sts	fpscr, r7
	mov	#0x10, r0	! PR=0 SZ=1
	shll16	r0
	lds	r0, fpscr
.endm
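/*
 * A note on the constant above: "mov #0x10, r0" followed by "shll16"
 * yields r0 = 0x00100000, i.e. FPSCR with only the SZ bit (bit 20) set,
 * so SZ=1 (paired single moves transfer 64 bits) and PR=0.  In C terms:
 *
 *	unsigned long fpscr = 0x10UL << 16;	// == 1UL << 20, the SZ bit
 */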
.macro RESTORE_FPSCR
	lds	r7, fpscr
.endm
.macro DALLOC
	! Cache allocate + store on dst-32.
	add	#-32, r1
	movca.l	r0, @r1
	add	#32, r1
.endm
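/*
 * DALLOC relies on movca.l, which stores r0 to @r1 and allocates the
 * containing cache line without first reading it from memory, saving a
 * bus read of a line we are about to overwrite completely.  As a hedged
 * C-level approximation (a write-hint prefetch is the closest portable
 * analogue, not the exact semantics of movca.l):
 *
 *	static inline void dalloc(char *dst)
 *	{
 *		__builtin_prefetch(dst - 32, 1);	// 1 = prepare for write
 *	}
 */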

#endif

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

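/*
 * The misaligned cases (.Lcase1/.Lcase3) all use the same technique:
 * load aligned long words from the source and rebuild each unaligned
 * value with a pair of shifts.  A hedged C sketch for the offset-1,
 * little endian case, matching the shll16/shll8 + shlr8 pairs below
 * ("prev" is the aligned word loaded on the previous iteration of the
 * backward copy, "cur" the one just loaded):
 *
 *	unsigned long merge(unsigned long prev, unsigned long cur)
 *	{
 *		return (prev << 24) | (cur >> 8);
 *	}
 */
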
	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef __LITTLE_ENDIAN__
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy the remaining bytes one at a time, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	mov.b	r1,@-r0		!  29 LS

9:	rts
	nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef __LITTLE_ENDIAN__
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	mov.l	r3,@-r0
#endif

	! Finally, copy the remaining bytes one at a time, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	mov.b	r1,@-r0		!  29 LS

9:	rts
	nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!	r4   -->  [ ... ] DST		  [ ... ] SRC
	!	          [ ... ]		  [ ... ]
	!	            :			    :
	!	r0   -->  [ ... ]	r0+r5 --> [ ... ]
	!
	!

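	! As a C-level sketch of the register assignment set up below
	! (illustrative only):
	!
	!	char *d  = (char *)dst + n;            (r0: one past the end of DST)
	!	long off = (char *)src - (char *)dst;  (r5: SRC - DST)
	!
	! so the current source byte is always d[off], i.e. @(r0,r5), and the
	! copy runs backwards via the pre-decrement store *--d; keeping src as
	! an offset from dst frees a register and lets a single decrement of
	! r0 advance both cursors.
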
	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	bt/s	.Lcase00	! 111 BR		(aligned)
	sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines becomes worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

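	! For such small copies the loop below moves one byte per load/store
	! pair, but unrolls two ways with two read streams one byte apart so
	! a load and a store can issue every cycle.  Roughly, in C (a sketch;
	! the assembly also peels off an odd leftover byte first):
	!
	!	while (pairs--) {
	!		char a = d[off];        (stream 1: @(r0,r5))
	!		char b = d[off - 1];    (stream 2: @(r0,r3))
	!		*--d = a;
	!		*--d = b;
	!	}
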
	mov	#16, r1		!   6 EX
	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	mov.b	r2,@-r0		!  29 LS
98:
	rts
	nop

99:	rts
	mov	r4, r0

	! Size is not small, so it's worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value - 1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	mov	r3, r0

	bra	.Lcase2
	nop

	! big
2:	bt/s	.Lcase0b
	mov	r3, r0

	bra	.Lcase2b
	nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	mov	r3, r0

	bra	.Lcase3
	nop

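	! The dispatch above amounts to a switch on the low two bits of
	! (src - dst) once dst is long word aligned, with an extra size test
	! for the two cases that have a cache-line-sized big loop.  As a
	! hedged C sketch (case0() etc. are just names for the labels):
	!
	!	switch ((src - dst) & 3) {
	!	case 0:	size < 64 ? case0() : case0b(); break;
	!	case 2:	size < 64 ? case2() : case2b(); break;
	!	case 1:	case1(); break;
	!	case 3:	case3(); break;
	!	}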

	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	mov.l	r2, @-r0	!  30 LS

5:	rts
	nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	mov.b	r1,@-r0		!  28 LS

9:	rts
	nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	mov.l	r7, @-r0	!  30 LS

#ifdef MEMCPY_USES_FPU
	! Copy the cache line aligned blocks using the FPU registers.
	! If src and dst are well aligned, adopt 64-bit data transfer.
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	src (was r0+r5)
	!   r1:	dest (was r0)
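	!
	! Shape of the prefetching loop below, as a hedged C sketch: each
	! iteration moves 128 bytes (four cache lines) through the sixteen
	! double-precision FPU registers while keeping the prefetch pointer
	! 128 bytes ahead of the reads (copy_128_via_fpu is a hypothetical
	! name for the fmov block):
	!
	!	while (lines >= 4) {
	!		prefetch(src - 0x80);
	!		copy_128_via_fpu(dst -= 128, src -= 128);
	!		lines -= 4;
	!	}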
1:
	add	r0, r5
	mov	r0, r1

	mov	r1, r3		! MT
	sub	r2, r3		! EX (r3 - r2 -> r3)
	mov	#-5, r0
	shld	r0, r3		! number of cache lines

	mov	#8, r0
	cmp/ge	r0, r3		! Check if there are many cache lines to copy.
	bf	45f		! Copy cache line aligned blocks without pref.
	mov	r5, r0
	add	#-0x7c, r0
	tst	#7, r0		! src is 8-byte aligned
	bf	45f

	! Many cache lines have to be copied and the buffers are well aligned.
	! Aggressive prefetching and FPU in single paired precision.
	mov	r0, r5
	mov	r5, r6
	add	#-0x80, r6	! prefetch head

	! Save the FPU registers (single precision moves; no need to check
	! r15 alignment).
	fmov	fr12, @-r15
	fmov	fr13, @-r15
	fmov	fr14, @-r15
	fmov	fr15, @-r15

	FPU_SET_PAIRED_PREC

	mov	#4, r0
67:
	add	#-0x20, r6
	pref	@r6
	add	#-0x20, r6
	pref	@r6

	fmov	@r5+, dr0
	fmov	@r5+, dr2
	fmov	@r5+, dr4
	fmov	@r5+, dr6
	fmov	@r5+, dr8
	fmov	@r5+, dr10
	fmov	@r5+, dr12
	fmov	@r5+, dr14
	fmov	@r5+, xd0
	fmov	@r5+, xd2
	fmov	@r5+, xd4
	fmov	@r5+, xd6
	fmov	@r5+, xd8
	fmov	@r5+, xd10
	fmov	@r5+, xd12
	fmov	@r5+, xd14

	DALLOC
	fmov	xd14, @-r1
	fmov	xd12, @-r1
	fmov	xd10, @-r1
	fmov	xd8, @-r1
	DALLOC
	fmov	xd6, @-r1
	fmov	xd4, @-r1
	fmov	xd2, @-r1
	fmov	xd0, @-r1
	DALLOC
	fmov	dr14, @-r1
	fmov	dr12, @-r1
	fmov	dr10, @-r1
	fmov	dr8, @-r1
	DALLOC
	fmov	dr6, @-r1
	add	#-0x80, r5
	fmov	dr4, @-r1
	add	#-0x80, r5
	fmov	dr2, @-r1
	add	#-0x20, r6
	fmov	dr0, @-r1
	add	#-4, r3
	pref	@r6
	add	#-0x20, r6
	cmp/ge	r0, r3
	bt/s	67b
	pref	@r6

	RESTORE_FPSCR

	! Restore FPU callee-save registers
	fmov	@r15+, fr15
	fmov	@r15+, fr14
	fmov	@r15+, fr13
	fmov	@r15+, fr12

	! Other cache lines could be copied: so use the FPU in single paired
	! precision without prefetching.  No check for alignment is necessary.

	mov	#1, r0
	cmp/ge	r0, r3
	bt/s	3f
	add	#0x60, r5

	bra	5f
	nop

	! No prefetch and FPU in single precision.
45:
	add	#-0x1c, r5
	mov	r5, r0
	tst	#7, r0
	bt	3f

2:	fmov.s	@r5+, fr0
	fmov.s	@r5+, fr1
	fmov.s	@r5+, fr2
	fmov.s	@r5+, fr3
	fmov.s	@r5+, fr4
	fmov.s	@r5+, fr5
	fmov.s	@r5+, fr6
	fmov.s	@r5+, fr7

	DALLOC

	fmov.s	fr7, @-r1
	fmov.s	fr6, @-r1
	fmov.s	fr5, @-r1
	fmov.s	fr4, @-r1
	fmov.s	fr3, @-r1
	fmov.s	fr2, @-r1
	fmov.s	fr1, @-r1
	fmov.s	fr0, @-r1

	cmp/eq	r2,r1

	bf/s	2b
	add	#-0x40, r5

	bra	5f
	nop

	! No prefetch and FPU in single paired precision.

3:	FPU_SET_PAIRED_PREC

4:	fmov	@r5+, dr0
	fmov	@r5+, dr2
	fmov	@r5+, dr4
	fmov	@r5+, dr6

	DALLOC

	fmov	dr6, @-r1
	fmov	dr4, @-r1
	fmov	dr2, @-r1
	fmov	dr0, @-r1
	cmp/eq	r2,r1

	bf/s	4b
	add	#-0x40, r5

	RESTORE_FPSCR

5:	mov	r1, r0

	cmp/eq	r4, r0		!  54 MT
	bf/s	1f		! 109 BR
	sub	r1, r5		!  75 EX

	rts
	nop
1:
#else
	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	src (was r0+r5)
	!   r1:	dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	!  18 LS (latency=2)
	add	#-0x20, r1	!  50 EX
	mov.l	@(0x04,r5),r3	!  18 LS (latency=2)
	mov.l	@(0x08,r5),r6	!  18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	!  18 LS (latency=2)
	mov.l	@(0x10,r5),r8	!  18 LS (latency=2)
	mov.l	@(0x14,r5),r9	!  18 LS (latency=2)
	mov.l	@(0x18,r5),r10	!  18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	!  18 LS (latency=2)
	movca.l	r0,@r1		!  40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	!  33 LS
	mov.l	r6,@(0x08,r1)	!  33 LS
	mov.l	r7,@(0x0c,r1)	!  33 LS

	mov.l	r8,@(0x10,r1)	!  33 LS
	add	#-0x20, r5	!  50 EX

	mov.l	r9,@(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	mov.l	@r15+, r9	!  15 LS

	rts
1:	mov.l	@r15+, r8	!  15 LS
#endif
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	add	#11, r7		!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	add	#-4, r3		!  50 EX
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	mov.b	r1,@-r0		!  28 LS

9:	rts
	nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

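	! The loops below lean on the xtrct instruction, which extracts the
	! middle 32 bits of the 64-bit register pair Rm:Rn.  In C terms,
	! "xtrct Rm,Rn" computes (a sketch of the documented semantics):
	!
	!	rn = (rm << 16) | (rn >> 16);
	!
	! so two words fetched at a 2-byte misalignment can be re-centred
	! with a single instruction per output word.
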
	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	mov.w	r3,@-r0		!  29 LS

	bra	10f
	nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	src (was r0+r5)
	!   r1:	dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef __LITTLE_ENDIAN__
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	mov.l	@r15+, r9	!  15 LS

	rts
1:	mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	mov.b	r1,@-r0

END(memcpy)
libc_hidden_def (memcpy)