/* Optimized memcpy for Xtensa.
   Copyright (C) 2001, 2007 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
   Boston, MA 02110-1301, USA.  */

#include "../../sysdeps/linux/xtensa/sysdep.h"
#include <bits/xtensa-config.h>

        .macro  src_b r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8 r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm
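
/* Together these macros hide the byte-order dependence of the "funnel
   shift" used below for an unaligned source: ssa8 sets the shift-amount
   register (SAR) from the low two bits of the source address, and src_b
   then extracts one aligned word from two consecutive 32-bit source
   words.  Endianness only changes which ssa8 variant is used and the
   operand order passed to SRC.  */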

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

#define UNALIGNED_ADDRESSES_CHECKED 1

/* Do not use .literal_position in the ENTRY macro.  */
#undef LITERAL_POSITION
#define LITERAL_POSITION


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes with a loop, and then finish up
   with 8-, 4-, 2-, and 1-byte copies conditional on the length.

   Else (if the source is unaligned), do the same, but use SRC to align
   the source data.

   This code tries to use fall-through branches for the common case of an
   aligned source and destination and a length that is a multiple of 4
   (or 8).  */

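/* For reference, the overall flow corresponds roughly to the C sketch
   below.  It is illustrative only and not part of this file or the
   build: it assumes <stddef.h>/<stdint.h> types, ignores strict-aliasing
   concerns, and omits the SSA8/SRC funnel-shift trick used below for an
   unaligned source, handling that case (and the 8/4/2/1-byte tails) with
   plain byte copies instead.

     void *memcpy (void *dst, const void *src, size_t n)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       // Align the destination to a word boundary with small copies
       // (.Ldst1mod2 / .Ldst2mod4 below).
       while (((uintptr_t) d & 3) != 0 && n != 0)
         {
           *d++ = *s++;
           n--;
         }

       if (((uintptr_t) s & 3) == 0)
         {
           // Aligned source: move 16 bytes per iteration as four words,
           // mirroring the four l32i/s32i pairs in the main loop.
           while (n >= 16)
             {
               const uint32_t *sw = (const uint32_t *) s;
               uint32_t *dw = (uint32_t *) d;
               dw[0] = sw[0];
               dw[1] = sw[1];
               dw[2] = sw[2];
               dw[3] = sw[3];
               d += 16;
               s += 16;
               n -= 16;
             }
         }

       // Everything else degenerates to byte copies in this sketch.
       while (n != 0)
         {
           *d++ = *s++;
           n--;
         }

       return dst;
     }
*/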

/* Byte by byte copy.  */

        .text
        .align  4
        .literal_position
__memcpy_aux:

        /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
           (0 mod 4 alignment for LBEG).  */
        .byte   0

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, 2f
#else
        beqz    a4, 2f
        add     a7, a3, a4      /* a7 = end address for source */
#endif
1:      l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        blt     a3, a7, 1b
#endif
2:      retw


/* Destination is unaligned.  */

        .align  4
.Ldst1mod2:     /* dst is only byte aligned */

        /* Do short copies byte-by-byte.  */
        _bltui  a4, 7, .Lbytecopy
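        /* Note: the cutoffs (7 here, 6 at .Ldst2mod4) leave at least one
           full word to copy after the destination has been aligned;
           anything shorter is handled entirely by .Lbytecopy.  */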

        /* Copy 1 byte.  */
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
        addi    a5, a5, 1

        /* Return to main algorithm if dst is now aligned.  */
        _bbci.l a5, 1, .Ldstaligned

.Ldst2mod4:     /* dst has 16-bit alignment */

        /* Do short copies byte-by-byte.  */
        _bltui  a4, 6, .Lbytecopy

        /* Copy 2 bytes.  */
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2

        /* dst is now aligned; return to main algorithm.  */
        j       .Ldstaligned


ENTRY (memcpy)
        /* a2 = dst, a3 = src, a4 = len */

        mov     a5, a2          /* copy dst so that a2 is return value */
        _bbsi.l a2, 0, .Ldst1mod2
        _bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:

        /* Get number of loop iterations with 16B per iteration.  */
        srli    a7, a4, 4

        /* Check if source is aligned.  */
        movi    a8, 3
        _bany   a3, a8, .Lsrcunaligned

        /* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a8, a7, 4
        add     a8, a8, a3      /* a8 = end of last 16B source chunk */
#endif
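        /* Loads run ahead of the stores in the loop body (each s32i uses
           a value fetched by an earlier l32i), which helps hide load
           latency.  */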
1:      l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a8, 1b
#endif

        /* Copy any leftover pieces smaller than 16B.  */
2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8

3:      bbsi.l  a4, 2, 4f
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 4 bytes.  */
4:      l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 2 bytes.  */
5:      l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0

.Ldone:
        retw


/* Destination is aligned; source is unaligned.  */

        .align  4
.Lsrcunaligned:
        /* Avoid loading anything for zero-length copies.  */
        _beqz   a4, .Ldone

        /* Copy 16 bytes per iteration for word-aligned dst and
           unaligned src.  */
        ssa8    a3              /* set shift amount from byte offset */
#if UNALIGNED_ADDRESSES_CHECKED
        and     a11, a3, a8     /* save unalignment offset for below */
        sub     a3, a3, a11     /* align a3 */
#endif
        l32i    a6, a3, 0       /* load first word */
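        /* From here on, a6 holds the aligned source word containing the
           next bytes still to be stored; each src_b below combines it
           with the following word to form one aligned destination word.  */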
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a10, a7, 4
        add     a10, a10, a3    /* a10 = end of last 16B source chunk */
#endif
1:      l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a10, 1b
#endif

2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8          /* a6 = next source word, used by the 4-byte tail */

3:      bbci.l  a4, 2, 4f

        /* Copy 4 bytes.  */
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7          /* keep a6 = next unstored source word */
4:
#if UNALIGNED_ADDRESSES_CHECKED
        add     a3, a3, a11     /* readjust a3 with correct misalignment */
#endif
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 2 bytes.  */
5:      l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
        retw

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw

libc_hidden_def (memcpy)