blob: 470784ecd0ce6d7daad3e6fd49121f489b1570a8 [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
39#include <features.h>
40
41 .section .text..SHmedia32,"ax"
42 .globl memcpy
43 .type memcpy, @function
44 .align 5
45
46memcpy:
/*
 * memcpy: copy r4 bytes from the address in r3 to the address in r2.
 *
 * Register use, as read from the code below:
 *   r2   destination pointer.  Only ever used as a base address and never
 *        written, so it still holds the destination pointer at every exit.
 *   r3   source pointer.
 *   r4   byte count.
 *   r5   r2 + r4 (one past the last destination byte).  The Large path
 *        recomputes it and then lowers it by 16.
 *   r6   r3 + r4 (one past the last source byte) on the small paths;
 *        r3 - r2 (source-minus-destination offset) on the Large path.
 *   r18  copied into tr1 with ptabs; every exit branches through tr1
 *        (presumably the return address -- confirm against the ABI).
 *   r63  destination of loads and blinks whose result is not used.
 *
 * Counts of 25 or more branch to Large.  Smaller counts go through the
 * computed branch at L0 into a table of fixed-size entries starting at L1.
 *
 * NOTE(review): the file header says only r0-r7 are trashed, but this
 * routine also writes r8, r9, r19-r25, r27 and r36.  Confirm that these
 * are caller-saved under the ABI in use.
 */
47
/*
 * Unaligned access helpers.  Each expands to a lo/hi instruction pair at
 * offsets O and O+7 (quadword) or O and O+3 (longword).  The load forms
 * fill two registers, which the code below combines with "or".
 * STUAQ and STUAL are defined but not used in the code shown here.
 */
48#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
49#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
50#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
51#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
52
/*
 * Size dispatch.
 * r0 = (L1 - L0 + 63*32 + 1) - 32 * nsb(r4), and ptrel at L0 adds that to
 * its own address, so the branch target is L1 + 32 * (63 - nsb(r4)) + 1.
 * NOTE(review): the "+ 1" presumably sets the low bit of the branch target
 * (SHmedia mode bit) -- confirm against the ptrel documentation.
 */
53 ld.b r3,0,r63 /* reads the first source byte for every count, including 0; value unused */
54 pta/l Large,tr0
55 movi 25,r0
56 bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): take the Large path */
57 nsb r4,r0 /* r0 = nsb(count); selects the size class */
58 shlli r0,5,r0 /* scale by 32, the spacing of the table entries */
59 movi (L1-L0+63*32 + 1) & 0xffff,r1
60 sub r1, r0, r0 /* r0 = offset from L0 to the entry for this size class */
61L0: ptrel r0,tr0
62 add r2,r4,r5 /* r5 = destination end */
63 ptabs r18,tr1 /* tr1 = exit target taken from r18 */
64 add r3,r4,r6 /* r6 = source end */
65 blink tr0,r63 /* branch to the selected table entry */
66
67/* Rearranged to make cut2 safe */
/*
 * Continuation of the 4 .. 7 byte entry.  It sits before L1, outside the
 * 32-byte-spaced table.  On entry r0 holds the first source longword and
 * r7/r6 hold the two halves of the last source longword.
 */
68 .balign 8
69L4_7: /* 4..7 byte memcpy cntd. */
70 stlo.l r2, 0, r0 /* low part of the first destination longword */
71 or r6, r7, r6 /* merge the two halves of the last source longword */
72 sthi.l r5, -1, r6 /* last 4 destination bytes, ending at r5 - 1 */
73 stlo.l r5, -4, r6
74 blink tr1,r63
75
/*
 * Jump table.  The dispatch above lands on L1 + 32 * (63 - nsb(count)), so
 * every entry from L1 onward must occupy exactly 32 bytes (eight
 * instructions, assuming 4-byte SHmedia encodings).  Entries that need
 * fewer than eight slots are padded with nops or share the slot with a
 * continuation block (L2_3, L8_15).  Do not add or remove instructions
 * between L1 and the start of the 16 .. 24 byte entry.
 */
76 .balign 8
77L1: /* 0 byte memcpy */
78 nop
79 blink tr1,r63
80 nop
81 nop
82 nop
83 nop
84
85L2_3: /* 2 or 3 byte memcpy cntd. */
86 st.b r5,-1,r6 /* last destination byte; r6 was loaded from r6 - 1 by the 2..3 entry */
87 blink tr1,r63
88
89 /* 1 byte memcpy */
90 ld.b r3,0,r0
91 st.b r2,0,r0
92 blink tr1,r63
93
94L8_15: /* 8..15 byte memcpy cntd. */
95 stlo.q r2, 0, r0 /* low part of the first destination quadword */
96 or r6, r7, r6 /* merge the two halves of the last source quadword */
97 sthi.q r5, -1, r6 /* last 8 destination bytes, ending at r5 - 1 */
98 stlo.q r5, -8, r6
99 blink tr1,r63
100
101 /* 2 or 3 byte memcpy */
 /* Copies byte 0, byte 1 and the last byte; for a count of 2 the last two coincide. */
102 ld.b r3,0,r0
103 ld.b r2,0,r63 /* reads the first destination byte; value unused */
104 ld.b r3,1,r1
105 st.b r2,0,r0
106 pta/l L2_3,tr0
107 ld.b r6,-1,r6 /* r6 = last source byte (overwrites the source-end pointer) */
108 st.b r2,1,r1
109 blink tr0, r63
110
111 /* 4 .. 7 byte memcpy */
 /* Copies the first 4 and the last 4 bytes; the two ranges may overlap. */
112 LDUAL (r3, 0, r0, r1)
113 pta L4_7, tr0
114 ldlo.l r6, -4, r7 /* last source longword: low half into r7 ... */
115 or r0, r1, r0 /* r0 = first source longword */
116 sthi.l r2, 3, r0
117 ldhi.l r6, -1, r6 /* ... high half into r6 */
118 blink tr0, r63
119
120 /* 8 .. 15 byte memcpy */
 /* Copies the first 8 and the last 8 bytes; the two ranges may overlap. */
121 LDUAQ (r3, 0, r0, r1)
122 pta L8_15, tr0
123 ldlo.q r6, -8, r7 /* last source quadword: low half into r7 ... */
124 or r0, r1, r0 /* r0 = first source quadword */
125 sthi.q r2, 7, r0
126 ldhi.q r6, -1, r6 /* ... high half into r6 */
127 blink tr0, r63
128
129 /* 16 .. 24 byte memcpy */
 /* Last table entry, so it is not limited to eight instructions. */
 /* Copies the first 16 and the last 8 bytes; the two ranges may overlap. */
130 LDUAQ (r3, 0, r0, r1)
131 LDUAQ (r3, 8, r8, r9)
132 or r0, r1, r0 /* r0 = source bytes 0 .. 7 */
133 sthi.q r2, 7, r0
134 or r8, r9, r8 /* r8 = source bytes 8 .. 15 */
135 sthi.q r2, 15, r8
136 ldlo.q r6, -8, r7
137 ldhi.q r6, -1, r6
138 stlo.q r2, 8, r8
139 stlo.q r2, 0, r0
140 or r6, r7, r6 /* r6 = last 8 source bytes */
141 sthi.q r5, -1, r6
142 stlo.q r5, -8, r6
143 blink tr1,r63
144
/*
 * Large copy (count >= 25).
 * r22 is the destination cursor.  It starts at r2 + 8 - (r3 & 7), and
 * r6 = r3 - r2, so r22 + r6 is always an 8-byte-aligned source address.
 * The ldx.q loads therefore read aligned source quadwords, and each is
 * written to the destination with an sthi.q / stlo.q pair.
 * r0 always carries the quadword loaded for the next store.
 */
145Large:
146 ld.b r2, 0, r63 /* reads the first destination byte; value unused */
147 pta/l Loop_ua, tr1
148 ori r3, -8, r7 /* r7 = (r3 & 7) - 8, in the range -8 .. -1 */
149 sub r2, r7, r22 /* r22 = r2 + 8 - (r3 & 7) */
150 sub r3, r2, r6 /* r6 = source - destination */
151 add r2, r4, r5
152 ldlo.q r3, 0, r0 /* leading source bytes, read at the unaligned start */
153 addi r5, -16, r5 /* r5 = destination end - 16 */
154 movi 64+8, r27 /* could subtract r7 from that. */
155 stlo.q r2, 0, r0 /* store the leading bytes at the destination start */
156 sthi.q r2, 7, r0
157 ldx.q r22, r6, r0 /* r0 = first aligned source quadword */
158 bgtu/l r27, r4, tr1 /* 72 > count: skip the 32-byte loop */
159
160 addi r5, -48, r27 /* r27 = destination end - 64: limit for Loop_line */
161 pta/l Loop_line, tr0
162 addi r6, 64, r36 /* index for the read-ahead load in Loop_line */
163 addi r6, -24, r19 /* r19/r20/r21 index the 2nd/3rd/4th quadword of a block, */
164 addi r6, -16, r20 /* relative to r22 after it has been advanced by 32 */
165 addi r6, -8, r21
166
/*
 * Moves 32 bytes per iteration.  The first quadword of each block is the
 * value already in r0 from the previous load; the other three are loaded
 * into r23, r24 and r25, and r0 is then reloaded for the next block.
 */
167Loop_line:
168 ldx.q r22, r36, r63 /* reads the source 64 bytes ahead; value unused */
169 alloco r22, 32 /* NOTE(review): presumably allocates the destination cache block -- confirm */
170 addi r22, 32, r22 /* advance the cursor by one 32-byte block */
171 ldx.q r22, r19, r23
172 sthi.q r22, -25, r0
173 ldx.q r22, r20, r24
174 ldx.q r22, r21, r25
175 stlo.q r22, -32, r0
176 ldx.q r22, r6, r0 /* r0 = first quadword of the next block */
177 sthi.q r22, -17, r23
178 sthi.q r22, -9, r24
179 sthi.q r22, -1, r25
180 stlo.q r22, -24, r23
181 stlo.q r22, -16, r24
182 stlo.q r22, -8, r25
183 bgeu r27, r22, tr0 /* repeat while r27 >= r22 (unsigned) */
184
/* Moves 8 bytes per iteration: store r0, then load the next quadword. */
185Loop_ua:
186 addi r22, 8, r22
187 sthi.q r22, -1, r0
188 stlo.q r22, -8, r0
189 ldx.q r22, r6, r0
190 bgtu/l r5, r22, tr1 /* repeat while r5 > r22 (unsigned); tr1 = Loop_ua here */
191
/*
 * Tail: store the quadword still held in r0, then copy the last 8 source
 * bytes to the last 8 destination bytes (r5 + 8 .. r5 + 15, since
 * r5 = destination end - 16).  This may overlap bytes already written.
 */
192 add r3, r4, r7 /* r7 = source end */
193 ldlo.q r7, -8, r1
194 sthi.q r22, 7, r0
195 ldhi.q r7, -1, r7
196 ptabs r18,tr1 /* tr1 was used for Loop_ua; reload it from r18 for the exit */
197 stlo.q r22, 0, r0
198 or r1, r7, r1 /* r1 = last 8 source bytes */
199 sthi.q r5, 15, r1
200 stlo.q r5, 8, r1
201 blink tr1, r63
202
203 .size memcpy,.-memcpy
204
205libc_hidden_def(memcpy)