/* strcpy/stpcpy - copy a string returning pointer to start/end.
   Copyright (C) 2013-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
   the slower entry path.  This option is not intended for production use.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* Register allocation.  Only x0-x17 are named here; the routine makes
   no calls and never references sp, so nothing is saved or restored.  */

/* Arguments and results.  */
#define dstin		x0	/* Destination pointer; also the return value.  */
#define srcin		x1	/* Source pointer.  */

/* Locals and temporaries.  */
#define src		x2	/* Read cursor (16-byte aligned in the bulk loop).  */
#define dst		x3	/* Write cursor / address derived from the NUL position.  */
#define data1		x4	/* First Dword of the current 16-byte fetch.  */
#define data1w		w4	/* 32-bit view of data1.  */
#define data2		x5	/* Second Dword of the current 16-byte fetch.  */
#define data2w		w5	/* 32-bit view of data2.  */
#define has_nul1	x6	/* NUL syndrome computed for data1.  */
#define has_nul2	x7	/* NUL syndrome computed for data2.  */
#define tmp1		x8	/* Scratch; holds -to_align from entry until reused.  */
#define tmp2		x9	/* Scratch.  */
#define tmp3		x10	/* Scratch.  */
#define tmp4		x11	/* Scratch.  */
#define zeroones	x12	/* Holds REP8_01 for the whole routine.  */
#define data1a		x13	/* Scratch for the page-cross rotate.  */
#define data2a		x14	/* data2 with leading bytes forced to 0xff (page-cross).  */
#define pos		x15	/* Bit position of the terminating NUL.  */
#define len		x16	/* Not referenced by the code in this file.  */
#define to_align	x17	/* srcin & 15.  */

/* Name of the entry point emitted by this file: __stpcpy when
   BUILD_STPCPY is defined, strcpy otherwise.  */
#ifdef BUILD_STPCPY
#define STRCPY	__stpcpy
#else
#define STRCPY	strcpy
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.

   In the code below this is the recurring sequence
	sub	t1, X, zeroones		(X - REP8_01)
	orr	t2, X, #REP8_7f		(X | REP8_7f)
	bic	syndrome, t1, t2	(t1 & ~t2)
   REP8_80 is defined for completeness; the code in this file does
   not reference it.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* AArch64 systems have a minimum page size of 4k.  We can do a quick
   page size check for crossing this boundary on entry and if we
   do not, then we can short-circuit much of the entry code.  We
   expect early page-crossing strings to be rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
   predictable, even with random strings.

   We don't bother checking for larger page sizes, the cost of setting
   up the correct page size is just not worth the extra gain from
   a small reduction in the cases taking the slow path.  Note that
   we only care about whether the first fetch, which may be
   misaligned, crosses a page boundary - after that we move to aligned
   fetches for the remainder of the string.  */

/* MIN_PAGE_P2 is log2 of the page size assumed by the entry check.
   With STRCPY_TEST_PAGE_CROSS it drops to 4 (a 16-byte "page"), so
   every source that is not 16-byte aligned takes L(page_cross).  */
#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)

/* char *strcpy (char *dstin, const char *srcin)
   char *__stpcpy (char *dstin, const char *srcin)	[BUILD_STPCPY]

   In:	x0 = dstin, x1 = srcin.
   Out:	strcpy: x0 is returned unchanged (it is only written inside
	BUILD_STPCPY blocks).
	stpcpy: x0 = address of the terminating NUL in the destination.
   Clobbers: x2-x15, x17 and the condition flags.
   Leaf routine: makes no calls and does not use the stack.

   Structure:
     entry		- page-cross test, then one unaligned 16-byte fetch.
     L(fp_le8)		- terminator within the first 8 bytes.
     L(fp_gt8)		- terminator within bytes 8..15.
     L(bulk_entry)	- no terminator in the first 16 bytes; copy them
			  and continue from the next 16-byte aligned
			  source address.
     L(main_loop)	- 16 bytes per iteration, aligned loads.
     L(page_cross)	- first fetch would cross a MIN_PAGE_SIZE boundary;
			  inspect the aligned 16 bytes containing srcin.  */
ENTRY_ALIGN (STRCPY, 6)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect strcpy to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate this by preferring
	   conditional select operations over branches whenever this is
	   feasible.  */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)	/* Offset within page.  */
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align		/* Only consumed by L(page_cross).  */
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
	b.gt	L(page_cross)

L(page_cross_ok):
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	L(bulk_entry)

	/* The string is short (<=16 bytes).  We don't know exactly how
	   short though, yet.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
L(fp_gt8):
	/* Here has_nul2 != 0 and the terminator is byte k (0..7) of the
	   second Dword.  After rev + clz, pos = 8 * k.  data2 is shifted
	   by (56 - 8 * k) bits so that an 8-byte store at dst + 1
	   (= dstin + k + 1) ends exactly on the NUL at dstin + 8 + k.
	   The zero bytes shifted in land below dstin + 8 and are then
	   overwritten by the store of data1.  */
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8			/* dstin + 8 + k: the NUL.  */
#endif
	ret

L(fp_le8):
	/* Here has_nul1 != 0 and the terminator is byte k (0..7) of the
	   first Dword.  pos = 8 * k, dst = dstin + k (address of the NUL
	   in the destination).  */
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	subs	tmp2, pos, #24			/* Pos in bits.  */
	b.lt	L(fp_lt4)			/* k < 3.  */
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* NUL at index 3..7: 4..8 bytes to copy (terminator included),
	   done as two possibly overlapping word stores - the last four
	   bytes ending on the NUL, then the first four bytes.  */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
L(fp_lt4):
	cbz	pos, L(fp_lt2)			/* k == 0: empty string.  */
	/* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go.  */
L(fp_lt2):
	/* Null-terminated string.  Last character must be zero!  */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret

	.p2align 6
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
L(bulk_entry):
	/* The first 16 source bytes contain no NUL.  Store them, then
	   step both cursors forward by 16 - (srcin & 15) so that src is
	   16-byte aligned.  Bytes between that point and srcin + 16 are
	   simply copied a second time by the loop.  */
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	L(entry_no_page_cross)

	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
	stp	data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	/* If has_nul2 == 0, compare has_nul1 with 0; otherwise force
	   "ne".  The loop continues only when both syndromes are zero.  */
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(main_loop)

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
	cmp	has_nul1, #0		/* ne <=> NUL is in data1.  */
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	/* None of the instructions up to the csel below sets flags, so
	   the result of the cmp above is still live there.  With the NUL
	   at byte k of its Dword, pos becomes 8 * (k + 1) for data1 or
	   8 * (k + 9) for data2: the bit offset from the start of this
	   16-byte block to just past the NUL.  */
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
	/* src was already post-incremented past this block, dst was not.
	   After the adds, src - 16 and dst both point one past the NUL,
	   so the pair below moves the 16 bytes that end with the NUL.  */
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret

L(page_cross):
	/* On entry tmp1 = -to_align and to_align = srcin & 15 (non-zero,
	   since an aligned srcin never branches here).  */
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
	tst	to_align, #7
	csetm	tmp2, ne	/* All ones unless to_align is a multiple of 8.  */
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	/* tmp2 now masks the (to_align & 7) bytes that precede srcin
	   within one Dword.  For to_align < 8 it applies to data1; for
	   to_align >= 8 all of data1 precedes srcin (forced to all ones)
	   and the mask applies to data2.  */
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	/* No NUL between srcin and the end of this aligned block: the
	   string continues past it, so take the normal entry path.  */
	b.eq	L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
	neg	tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	/* Recompute the syndromes on the realigned data and finish via
	   the same short-string tails as the fast path.  */
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, L(fp_le8)
	bic	has_nul2, tmp3, tmp4
	b	L(fp_gt8)
END (STRCPY)

/* Symbol aliases for the entry point defined above.  weak_alias and the
   libc_hidden_* macros are not defined in this file - presumably they
   come from glibc-internal headers reached through <sysdep.h>; confirm
   against the build.  */
#ifdef BUILD_STPCPY
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)
#else
libc_hidden_builtin_def (strcpy)
#endif