/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

32 .text
33 .globl memcpy
34 .type memcpy, @function
35 .ent memcpy
36
37memcpy:
38fast_memcpy_ascending:
39 /* move d to return register as value of function */
40 addi r3, r5, 0
41
42 addi r4, r0, 4 /* n = 4 */
43 cmpu r4, r4, r7 /* n = c - n (unsigned) */
44 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
45
46 /* transfer first 0~3 bytes to get aligned dest address */
47 andi r4, r5, 3 /* n = d & 3 */
48 /* if zero, destination already aligned */
49 beqi r4, a_dalign_done
50 /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
51 rsubi r4, r4, 4
52 rsub r7, r4, r7 /* c = c - n adjust c */
53
54a_xfer_first_loop:
55 /* if no bytes left to transfer, transfer the bulk */
56 beqi r4, a_dalign_done
57 lbui r11, r6, 0 /* h = *s */
58 sbi r11, r5, 0 /* *d = h */
59 addi r6, r6, 1 /* s++ */
60 addi r5, r5, 1 /* d++ */
61 brid a_xfer_first_loop /* loop */
62 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
63
64a_dalign_done:
65 addi r4, r0, 32 /* n = 32 */
66 cmpu r4, r4, r7 /* n = c - n (unsigned) */
67 /* if n < 0, less than one block to transfer */
68 blti r4, a_block_done
69
70a_block_xfer:
71 andi r9, r6, 3 /* t1 = s & 3 */
72 /* if temp == 0, everything is word-aligned */
73 beqi r9, a_word_xfer
74
75a_block_unaligned:
76 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
77 rsub r7, r4, r7 /* c = c - n */
78 andi r8, r6, 0xfffffffc /* as = s & ~3 */
79 add r6, r6, r4 /* s = s + n */
80 lwi r11, r8, 0 /* h = *(as + 0) */
81
82 addi r9, r9, -1
83 beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
84 addi r9, r9, -1
85 beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
86
87a_block_u3:
88 bslli r11, r11, 24 /* h = h << 24 */
89a_bu3_loop:
90 lwi r12, r8, 4 /* v = *(as + 4) */
91 bsrli r9, r12, 8 /* t1 = v >> 8 */
92 or r9, r11, r9 /* t1 = h | t1 */
93 swi r9, r5, 0 /* *(d + 0) = t1 */
94 bslli r11, r12, 24 /* h = v << 24 */
95 lwi r12, r8, 8 /* v = *(as + 8) */
96 bsrli r9, r12, 8 /* t1 = v >> 8 */
97 or r9, r11, r9 /* t1 = h | t1 */
98 swi r9, r5, 4 /* *(d + 4) = t1 */
99 bslli r11, r12, 24 /* h = v << 24 */
100 lwi r12, r8, 12 /* v = *(as + 12) */
101 bsrli r9, r12, 8 /* t1 = v >> 8 */
102 or r9, r11, r9 /* t1 = h | t1 */
103 swi r9, r5, 8 /* *(d + 8) = t1 */
104 bslli r11, r12, 24 /* h = v << 24 */
105 lwi r12, r8, 16 /* v = *(as + 16) */
106 bsrli r9, r12, 8 /* t1 = v >> 8 */
107 or r9, r11, r9 /* t1 = h | t1 */
108 swi r9, r5, 12 /* *(d + 12) = t1 */
109 bslli r11, r12, 24 /* h = v << 24 */
110 lwi r12, r8, 20 /* v = *(as + 20) */
111 bsrli r9, r12, 8 /* t1 = v >> 8 */
112 or r9, r11, r9 /* t1 = h | t1 */
113 swi r9, r5, 16 /* *(d + 16) = t1 */
114 bslli r11, r12, 24 /* h = v << 24 */
115 lwi r12, r8, 24 /* v = *(as + 24) */
116 bsrli r9, r12, 8 /* t1 = v >> 8 */
117 or r9, r11, r9 /* t1 = h | t1 */
118 swi r9, r5, 20 /* *(d + 20) = t1 */
119 bslli r11, r12, 24 /* h = v << 24 */
120 lwi r12, r8, 28 /* v = *(as + 28) */
121 bsrli r9, r12, 8 /* t1 = v >> 8 */
122 or r9, r11, r9 /* t1 = h | t1 */
123 swi r9, r5, 24 /* *(d + 24) = t1 */
124 bslli r11, r12, 24 /* h = v << 24 */
125 lwi r12, r8, 32 /* v = *(as + 32) */
126 bsrli r9, r12, 8 /* t1 = v >> 8 */
127 or r9, r11, r9 /* t1 = h | t1 */
128 swi r9, r5, 28 /* *(d + 28) = t1 */
129 bslli r11, r12, 24 /* h = v << 24 */
130 addi r8, r8, 32 /* as = as + 32 */
131 addi r4, r4, -32 /* n = n - 32 */
132 bneid r4, a_bu3_loop /* while (n) loop */
133 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
134 bri a_block_done
135
136a_block_u1:
137 bslli r11, r11, 8 /* h = h << 8 */
138a_bu1_loop:
139 lwi r12, r8, 4 /* v = *(as + 4) */
140 bsrli r9, r12, 24 /* t1 = v >> 24 */
141 or r9, r11, r9 /* t1 = h | t1 */
142 swi r9, r5, 0 /* *(d + 0) = t1 */
143 bslli r11, r12, 8 /* h = v << 8 */
144 lwi r12, r8, 8 /* v = *(as + 8) */
145 bsrli r9, r12, 24 /* t1 = v >> 24 */
146 or r9, r11, r9 /* t1 = h | t1 */
147 swi r9, r5, 4 /* *(d + 4) = t1 */
148 bslli r11, r12, 8 /* h = v << 8 */
149 lwi r12, r8, 12 /* v = *(as + 12) */
150 bsrli r9, r12, 24 /* t1 = v >> 24 */
151 or r9, r11, r9 /* t1 = h | t1 */
152 swi r9, r5, 8 /* *(d + 8) = t1 */
153 bslli r11, r12, 8 /* h = v << 8 */
154 lwi r12, r8, 16 /* v = *(as + 16) */
155 bsrli r9, r12, 24 /* t1 = v >> 24 */
156 or r9, r11, r9 /* t1 = h | t1 */
157 swi r9, r5, 12 /* *(d + 12) = t1 */
158 bslli r11, r12, 8 /* h = v << 8 */
159 lwi r12, r8, 20 /* v = *(as + 20) */
160 bsrli r9, r12, 24 /* t1 = v >> 24 */
161 or r9, r11, r9 /* t1 = h | t1 */
162 swi r9, r5, 16 /* *(d + 16) = t1 */
163 bslli r11, r12, 8 /* h = v << 8 */
164 lwi r12, r8, 24 /* v = *(as + 24) */
165 bsrli r9, r12, 24 /* t1 = v >> 24 */
166 or r9, r11, r9 /* t1 = h | t1 */
167 swi r9, r5, 20 /* *(d + 20) = t1 */
168 bslli r11, r12, 8 /* h = v << 8 */
169 lwi r12, r8, 28 /* v = *(as + 28) */
170 bsrli r9, r12, 24 /* t1 = v >> 24 */
171 or r9, r11, r9 /* t1 = h | t1 */
172 swi r9, r5, 24 /* *(d + 24) = t1 */
173 bslli r11, r12, 8 /* h = v << 8 */
174 lwi r12, r8, 32 /* v = *(as + 32) */
175 bsrli r9, r12, 24 /* t1 = v >> 24 */
176 or r9, r11, r9 /* t1 = h | t1 */
177 swi r9, r5, 28 /* *(d + 28) = t1 */
178 bslli r11, r12, 8 /* h = v << 8 */
179 addi r8, r8, 32 /* as = as + 32 */
180 addi r4, r4, -32 /* n = n - 32 */
181 bneid r4, a_bu1_loop /* while (n) loop */
182 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
183 bri a_block_done
184
185a_block_u2:
186 bslli r11, r11, 16 /* h = h << 16 */
187a_bu2_loop:
188 lwi r12, r8, 4 /* v = *(as + 4) */
189 bsrli r9, r12, 16 /* t1 = v >> 16 */
190 or r9, r11, r9 /* t1 = h | t1 */
191 swi r9, r5, 0 /* *(d + 0) = t1 */
192 bslli r11, r12, 16 /* h = v << 16 */
193 lwi r12, r8, 8 /* v = *(as + 8) */
194 bsrli r9, r12, 16 /* t1 = v >> 16 */
195 or r9, r11, r9 /* t1 = h | t1 */
196 swi r9, r5, 4 /* *(d + 4) = t1 */
197 bslli r11, r12, 16 /* h = v << 16 */
198 lwi r12, r8, 12 /* v = *(as + 12) */
199 bsrli r9, r12, 16 /* t1 = v >> 16 */
200 or r9, r11, r9 /* t1 = h | t1 */
201 swi r9, r5, 8 /* *(d + 8) = t1 */
202 bslli r11, r12, 16 /* h = v << 16 */
203 lwi r12, r8, 16 /* v = *(as + 16) */
204 bsrli r9, r12, 16 /* t1 = v >> 16 */
205 or r9, r11, r9 /* t1 = h | t1 */
206 swi r9, r5, 12 /* *(d + 12) = t1 */
207 bslli r11, r12, 16 /* h = v << 16 */
208 lwi r12, r8, 20 /* v = *(as + 20) */
209 bsrli r9, r12, 16 /* t1 = v >> 16 */
210 or r9, r11, r9 /* t1 = h | t1 */
211 swi r9, r5, 16 /* *(d + 16) = t1 */
212 bslli r11, r12, 16 /* h = v << 16 */
213 lwi r12, r8, 24 /* v = *(as + 24) */
214 bsrli r9, r12, 16 /* t1 = v >> 16 */
215 or r9, r11, r9 /* t1 = h | t1 */
216 swi r9, r5, 20 /* *(d + 20) = t1 */
217 bslli r11, r12, 16 /* h = v << 16 */
218 lwi r12, r8, 28 /* v = *(as + 28) */
219 bsrli r9, r12, 16 /* t1 = v >> 16 */
220 or r9, r11, r9 /* t1 = h | t1 */
221 swi r9, r5, 24 /* *(d + 24) = t1 */
222 bslli r11, r12, 16 /* h = v << 16 */
223 lwi r12, r8, 32 /* v = *(as + 32) */
224 bsrli r9, r12, 16 /* t1 = v >> 16 */
225 or r9, r11, r9 /* t1 = h | t1 */
226 swi r9, r5, 28 /* *(d + 28) = t1 */
227 bslli r11, r12, 16 /* h = v << 16 */
228 addi r8, r8, 32 /* as = as + 32 */
229 addi r4, r4, -32 /* n = n - 32 */
230 bneid r4, a_bu2_loop /* while (n) loop */
231 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
232
233a_block_done:
234 addi r4, r0, 4 /* n = 4 */
235 cmpu r4, r4, r7 /* n = c - n (unsigned) */
236 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
237
238a_word_xfer:
239 andi r4, r7, 0xfffffffc /* n = c & ~3 */
240 addi r10, r0, 0 /* offset = 0 */
241
242 andi r9, r6, 3 /* t1 = s & 3 */
243 /* if temp != 0, unaligned transfers needed */
244 bnei r9, a_word_unaligned
245
246a_word_aligned:
247 lw r9, r6, r10 /* t1 = *(s+offset) */
248 sw r9, r5, r10 /* *(d+offset) = t1 */
249 addi r4, r4,-4 /* n-- */
250 bneid r4, a_word_aligned /* loop */
251 addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
252
253 bri a_word_done
254
255a_word_unaligned:
256 andi r8, r6, 0xfffffffc /* as = s & ~3 */
257 lwi r11, r8, 0 /* h = *(as + 0) */
258 addi r8, r8, 4 /* as = as + 4 */
259
260 addi r9, r9, -1
261 beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
262 addi r9, r9, -1
263 beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
264
265a_word_u3:
266 bslli r11, r11, 24 /* h = h << 24 */
267a_wu3_loop:
268 lw r12, r8, r10 /* v = *(as + offset) */
269 bsrli r9, r12, 8 /* t1 = v >> 8 */
270 or r9, r11, r9 /* t1 = h | t1 */
271 sw r9, r5, r10 /* *(d + offset) = t1 */
272 bslli r11, r12, 24 /* h = v << 24 */
273 addi r4, r4,-4 /* n = n - 4 */
274 bneid r4, a_wu3_loop /* while (n) loop */
275 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
276
277 bri a_word_done
278
279a_word_u1:
280 bslli r11, r11, 8 /* h = h << 8 */
281a_wu1_loop:
282 lw r12, r8, r10 /* v = *(as + offset) */
283 bsrli r9, r12, 24 /* t1 = v >> 24 */
284 or r9, r11, r9 /* t1 = h | t1 */
285 sw r9, r5, r10 /* *(d + offset) = t1 */
286 bslli r11, r12, 8 /* h = v << 8 */
287 addi r4, r4,-4 /* n = n - 4 */
288 bneid r4, a_wu1_loop /* while (n) loop */
289 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
290
291 bri a_word_done
292
293a_word_u2:
294 bslli r11, r11, 16 /* h = h << 16 */
295a_wu2_loop:
296 lw r12, r8, r10 /* v = *(as + offset) */
297 bsrli r9, r12, 16 /* t1 = v >> 16 */
298 or r9, r11, r9 /* t1 = h | t1 */
299 sw r9, r5, r10 /* *(d + offset) = t1 */
300 bslli r11, r12, 16 /* h = v << 16 */
301 addi r4, r4,-4 /* n = n - 4 */
302 bneid r4, a_wu2_loop /* while (n) loop */
303 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
304
305a_word_done:
306 add r5, r5, r10 /* d = d + offset */
307 add r6, r6, r10 /* s = s + offset */
308 rsub r7, r10, r7 /* c = c - offset */
309
310a_xfer_end:
311a_xfer_end_loop:
312 beqi r7, a_done /* while (c) */
313 lbui r9, r6, 0 /* t1 = *s */
314 addi r6, r6, 1 /* s++ */
315 sbi r9, r5, 0 /* *d = t1 */
316 addi r7, r7, -1 /* c-- */
317 brid a_xfer_end_loop /* loop */
318 addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
319
320a_done:
321 rtsd r15, 8
322 nop
323
324.size memcpy, . - memcpy
325.end memcpy
326libc_hidden_def(memcpy)