/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 *	... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

32 .globl memmove
33 .type memmove, @function
34 .ent memmove
35
36memmove:
37 cmpu r4, r5, r6 /* n = s - d */
38 bgei r4, HIDDEN_JUMPTARGET(memcpy)
39
40fast_memcpy_descending:
41 /* move d to return register as value of function */
42 addi r3, r5, 0
43
44 add r5, r5, r7 /* d = d + c */
45 add r6, r6, r7 /* s = s + c */
46
47 addi r4, r0, 4 /* n = 4 */
48 cmpu r4, r4, r7 /* n = c - n (unsigned) */
49 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
50
51 /* transfer first 0~3 bytes to get aligned dest address */
52 andi r4, r5, 3 /* n = d & 3 */
53 /* if zero, destination already aligned */
54 beqi r4,d_dalign_done
55 rsub r7, r4, r7 /* c = c - n adjust c */
56
57d_xfer_first_loop:
58 /* if no bytes left to transfer, transfer the bulk */
59 beqi r4,d_dalign_done
60 addi r6, r6, -1 /* s-- */
61 addi r5, r5, -1 /* d-- */
62 lbui r11, r6, 0 /* h = *s */
63 sbi r11, r5, 0 /* *d = h */
64 brid d_xfer_first_loop /* loop */
65 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
66
67d_dalign_done:
68 addi r4, r0, 32 /* n = 32 */
69 cmpu r4, r4, r7 /* n = c - n (unsigned) */
70 /* if n < 0, less than one block to transfer */
71 blti r4, d_block_done
72
73d_block_xfer:
74 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
75 rsub r7, r4, r7 /* c = c - n */
76
77 andi r9, r6, 3 /* t1 = s & 3 */
78 /* if temp != 0, unaligned transfers needed */
79 bnei r9, d_block_unaligned
80
81d_block_aligned:
82 addi r6, r6, -32 /* s = s - 32 */
83 addi r5, r5, -32 /* d = d - 32 */
84 lwi r9, r6, 28 /* t1 = *(s + 28) */
85 lwi r10, r6, 24 /* t2 = *(s + 24) */
86 lwi r11, r6, 20 /* t3 = *(s + 20) */
87 lwi r12, r6, 16 /* t4 = *(s + 16) */
88 swi r9, r5, 28 /* *(d + 28) = t1 */
89 swi r10, r5, 24 /* *(d + 24) = t2 */
90 swi r11, r5, 20 /* *(d + 20) = t3 */
91 swi r12, r5, 16 /* *(d + 16) = t4 */
92 lwi r9, r6, 12 /* t1 = *(s + 12) */
93 lwi r10, r6, 8 /* t2 = *(s + 8) */
94 lwi r11, r6, 4 /* t3 = *(s + 4) */
95 lwi r12, r6, 0 /* t4 = *(s + 0) */
96 swi r9, r5, 12 /* *(d + 12) = t1 */
97 swi r10, r5, 8 /* *(d + 8) = t2 */
98 swi r11, r5, 4 /* *(d + 4) = t3 */
99 addi r4, r4, -32 /* n = n - 32 */
100 bneid r4, d_block_aligned /* while (n) loop */
101 swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
102 bri d_block_done
103
104d_block_unaligned:
105 andi r8, r6, 0xfffffffc /* as = s & ~3 */
106 rsub r6, r4, r6 /* s = s - n */
107 lwi r11, r8, 0 /* h = *(as + 0) */
108
109 addi r9, r9, -1
110 beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
111 addi r9, r9, -1
112 beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
113
114d_block_u3:
115 bsrli r11, r11, 8 /* h = h >> 8 */
116d_bu3_loop:
117 addi r8, r8, -32 /* as = as - 32 */
118 addi r5, r5, -32 /* d = d - 32 */
119 lwi r12, r8, 28 /* v = *(as + 28) */
120 bslli r9, r12, 24 /* t1 = v << 24 */
121 or r9, r11, r9 /* t1 = h | t1 */
122 swi r9, r5, 28 /* *(d + 28) = t1 */
123 bsrli r11, r12, 8 /* h = v >> 8 */
124 lwi r12, r8, 24 /* v = *(as + 24) */
125 bslli r9, r12, 24 /* t1 = v << 24 */
126 or r9, r11, r9 /* t1 = h | t1 */
127 swi r9, r5, 24 /* *(d + 24) = t1 */
128 bsrli r11, r12, 8 /* h = v >> 8 */
129 lwi r12, r8, 20 /* v = *(as + 20) */
130 bslli r9, r12, 24 /* t1 = v << 24 */
131 or r9, r11, r9 /* t1 = h | t1 */
132 swi r9, r5, 20 /* *(d + 20) = t1 */
133 bsrli r11, r12, 8 /* h = v >> 8 */
134 lwi r12, r8, 16 /* v = *(as + 16) */
135 bslli r9, r12, 24 /* t1 = v << 24 */
136 or r9, r11, r9 /* t1 = h | t1 */
137 swi r9, r5, 16 /* *(d + 16) = t1 */
138 bsrli r11, r12, 8 /* h = v >> 8 */
139 lwi r12, r8, 12 /* v = *(as + 12) */
140 bslli r9, r12, 24 /* t1 = v << 24 */
141 or r9, r11, r9 /* t1 = h | t1 */
142 swi r9, r5, 12 /* *(d + 112) = t1 */
143 bsrli r11, r12, 8 /* h = v >> 8 */
144 lwi r12, r8, 8 /* v = *(as + 8) */
145 bslli r9, r12, 24 /* t1 = v << 24 */
146 or r9, r11, r9 /* t1 = h | t1 */
147 swi r9, r5, 8 /* *(d + 8) = t1 */
148 bsrli r11, r12, 8 /* h = v >> 8 */
149 lwi r12, r8, 4 /* v = *(as + 4) */
150 bslli r9, r12, 24 /* t1 = v << 24 */
151 or r9, r11, r9 /* t1 = h | t1 */
152 swi r9, r5, 4 /* *(d + 4) = t1 */
153 bsrli r11, r12, 8 /* h = v >> 8 */
154 lwi r12, r8, 0 /* v = *(as + 0) */
155 bslli r9, r12, 24 /* t1 = v << 24 */
156 or r9, r11, r9 /* t1 = h | t1 */
157 swi r9, r5, 0 /* *(d + 0) = t1 */
158 addi r4, r4, -32 /* n = n - 32 */
159 bneid r4, d_bu3_loop /* while (n) loop */
160 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
161 bri d_block_done
162
163d_block_u1:
164 bsrli r11, r11, 24 /* h = h >> 24 */
165d_bu1_loop:
166 addi r8, r8, -32 /* as = as - 32 */
167 addi r5, r5, -32 /* d = d - 32 */
168 lwi r12, r8, 28 /* v = *(as + 28) */
169 bslli r9, r12, 8 /* t1 = v << 8 */
170 or r9, r11, r9 /* t1 = h | t1 */
171 swi r9, r5, 28 /* *(d + 28) = t1 */
172 bsrli r11, r12, 24 /* h = v >> 24 */
173 lwi r12, r8, 24 /* v = *(as + 24) */
174 bslli r9, r12, 8 /* t1 = v << 8 */
175 or r9, r11, r9 /* t1 = h | t1 */
176 swi r9, r5, 24 /* *(d + 24) = t1 */
177 bsrli r11, r12, 24 /* h = v >> 24 */
178 lwi r12, r8, 20 /* v = *(as + 20) */
179 bslli r9, r12, 8 /* t1 = v << 8 */
180 or r9, r11, r9 /* t1 = h | t1 */
181 swi r9, r5, 20 /* *(d + 20) = t1 */
182 bsrli r11, r12, 24 /* h = v >> 24 */
183 lwi r12, r8, 16 /* v = *(as + 16) */
184 bslli r9, r12, 8 /* t1 = v << 8 */
185 or r9, r11, r9 /* t1 = h | t1 */
186 swi r9, r5, 16 /* *(d + 16) = t1 */
187 bsrli r11, r12, 24 /* h = v >> 24 */
188 lwi r12, r8, 12 /* v = *(as + 12) */
189 bslli r9, r12, 8 /* t1 = v << 8 */
190 or r9, r11, r9 /* t1 = h | t1 */
191 swi r9, r5, 12 /* *(d + 112) = t1 */
192 bsrli r11, r12, 24 /* h = v >> 24 */
193 lwi r12, r8, 8 /* v = *(as + 8) */
194 bslli r9, r12, 8 /* t1 = v << 8 */
195 or r9, r11, r9 /* t1 = h | t1 */
196 swi r9, r5, 8 /* *(d + 8) = t1 */
197 bsrli r11, r12, 24 /* h = v >> 24 */
198 lwi r12, r8, 4 /* v = *(as + 4) */
199 bslli r9, r12, 8 /* t1 = v << 8 */
200 or r9, r11, r9 /* t1 = h | t1 */
201 swi r9, r5, 4 /* *(d + 4) = t1 */
202 bsrli r11, r12, 24 /* h = v >> 24 */
203 lwi r12, r8, 0 /* v = *(as + 0) */
204 bslli r9, r12, 8 /* t1 = v << 8 */
205 or r9, r11, r9 /* t1 = h | t1 */
206 swi r9, r5, 0 /* *(d + 0) = t1 */
207 addi r4, r4, -32 /* n = n - 32 */
208 bneid r4, d_bu1_loop /* while (n) loop */
209 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
210 bri d_block_done
211
212d_block_u2:
213 bsrli r11, r11, 16 /* h = h >> 16 */
214d_bu2_loop:
215 addi r8, r8, -32 /* as = as - 32 */
216 addi r5, r5, -32 /* d = d - 32 */
217 lwi r12, r8, 28 /* v = *(as + 28) */
218 bslli r9, r12, 16 /* t1 = v << 16 */
219 or r9, r11, r9 /* t1 = h | t1 */
220 swi r9, r5, 28 /* *(d + 28) = t1 */
221 bsrli r11, r12, 16 /* h = v >> 16 */
222 lwi r12, r8, 24 /* v = *(as + 24) */
223 bslli r9, r12, 16 /* t1 = v << 16 */
224 or r9, r11, r9 /* t1 = h | t1 */
225 swi r9, r5, 24 /* *(d + 24) = t1 */
226 bsrli r11, r12, 16 /* h = v >> 16 */
227 lwi r12, r8, 20 /* v = *(as + 20) */
228 bslli r9, r12, 16 /* t1 = v << 16 */
229 or r9, r11, r9 /* t1 = h | t1 */
230 swi r9, r5, 20 /* *(d + 20) = t1 */
231 bsrli r11, r12, 16 /* h = v >> 16 */
232 lwi r12, r8, 16 /* v = *(as + 16) */
233 bslli r9, r12, 16 /* t1 = v << 16 */
234 or r9, r11, r9 /* t1 = h | t1 */
235 swi r9, r5, 16 /* *(d + 16) = t1 */
236 bsrli r11, r12, 16 /* h = v >> 16 */
237 lwi r12, r8, 12 /* v = *(as + 12) */
238 bslli r9, r12, 16 /* t1 = v << 16 */
239 or r9, r11, r9 /* t1 = h | t1 */
240 swi r9, r5, 12 /* *(d + 112) = t1 */
241 bsrli r11, r12, 16 /* h = v >> 16 */
242 lwi r12, r8, 8 /* v = *(as + 8) */
243 bslli r9, r12, 16 /* t1 = v << 16 */
244 or r9, r11, r9 /* t1 = h | t1 */
245 swi r9, r5, 8 /* *(d + 8) = t1 */
246 bsrli r11, r12, 16 /* h = v >> 16 */
247 lwi r12, r8, 4 /* v = *(as + 4) */
248 bslli r9, r12, 16 /* t1 = v << 16 */
249 or r9, r11, r9 /* t1 = h | t1 */
250 swi r9, r5, 4 /* *(d + 4) = t1 */
251 bsrli r11, r12, 16 /* h = v >> 16 */
252 lwi r12, r8, 0 /* v = *(as + 0) */
253 bslli r9, r12, 16 /* t1 = v << 16 */
254 or r9, r11, r9 /* t1 = h | t1 */
255 swi r9, r5, 0 /* *(d + 0) = t1 */
256 addi r4, r4, -32 /* n = n - 32 */
257 bneid r4, d_bu2_loop /* while (n) loop */
258 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
259
260d_block_done:
261 addi r4, r0, 4 /* n = 4 */
262 cmpu r4, r4, r7 /* n = c - n (unsigned) */
263 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
264
265d_word_xfer:
266 andi r4, r7, 0xfffffffc /* n = c & ~3 */
267 rsub r5, r4, r5 /* d = d - n */
268 rsub r6, r4, r6 /* s = s - n */
269 rsub r7, r4, r7 /* c = c - n */
270
271 andi r9, r6, 3 /* t1 = s & 3 */
272 /* if temp != 0, unaligned transfers needed */
273 bnei r9, d_word_unaligned
274
275d_word_aligned:
276 addi r4, r4,-4 /* n-- */
277 lw r9, r6, r4 /* t1 = *(s+n) */
278 bneid r4, d_word_aligned /* loop */
279 sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
280
281 bri d_word_done
282
283d_word_unaligned:
284 andi r8, r6, 0xfffffffc /* as = s & ~3 */
285 lw r11, r8, r4 /* h = *(as + n) */
286
287 addi r9, r9, -1
288 beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
289 addi r9, r9, -1
290 beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
291
292d_word_u3:
293 bsrli r11, r11, 8 /* h = h >> 8 */
294d_wu3_loop:
295 addi r4, r4,-4 /* n = n - 4 */
296 lw r12, r8, r4 /* v = *(as + n) */
297 bslli r9, r12, 24 /* t1 = v << 24 */
298 or r9, r11, r9 /* t1 = h | t1 */
299 sw r9, r5, r4 /* *(d + n) = t1 */
300 bneid r4, d_wu3_loop /* while (n) loop */
301 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
302
303 bri d_word_done
304
305d_word_u1:
306 bsrli r11, r11, 24 /* h = h >> 24 */
307d_wu1_loop:
308 addi r4, r4,-4 /* n = n - 4 */
309 lw r12, r8, r4 /* v = *(as + n) */
310 bslli r9, r12, 8 /* t1 = v << 8 */
311 or r9, r11, r9 /* t1 = h | t1 */
312 sw r9, r5, r4 /* *(d + n) = t1 */
313 bneid r4, d_wu1_loop /* while (n) loop */
314 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
315
316 bri d_word_done
317
318d_word_u2:
319 bsrli r11, r11, 16 /* h = h >> 16 */
320d_wu2_loop:
321 addi r4, r4,-4 /* n = n - 4 */
322 lw r12, r8, r4 /* v = *(as + n) */
323 bslli r9, r12, 16 /* t1 = v << 16 */
324 or r9, r11, r9 /* t1 = h | t1 */
325 sw r9, r5, r4 /* *(d + n) = t1 */
326 bneid r4, d_wu2_loop /* while (n) loop */
327 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
328
329d_word_done:
330
331d_xfer_end:
332d_xfer_end_loop:
333 beqi r7, a_done /* while (c) */
334 addi r6, r6, -1 /* s-- */
335 lbui r9, r6, 0 /* t1 = *s */
336 addi r5, r5, -1 /* d-- */
337 sbi r9, r5, 0 /* *d = t1 */
338 brid d_xfer_end_loop /* loop */
339 addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
340
341a_done:
342d_done:
343 rtsd r15, 8
344 nop
345
346.size memmove, . - memmove
347.end memmove
348libc_hidden_def(memmove)