/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses are supported
 *
 */
/* Register aliases (AAPCS64: x0-x2 carry the incoming arguments,
   x3-x14 are caller-saved temporaries, so nothing needs saving).  */
#define dstin	x0	/* Incoming destination; left untouched so it is
			   also the return value.  */
#define src	x1	/* Source pointer, advanced as we copy.  */
#define count	x2	/* Bytes remaining.  */
#define tmp1	x3
#define tmp1w	w3	/* 32-bit view of tmp1.  */
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6	/* Working destination pointer (dstin stays intact).  */

/* Data registers: four independent 16-byte lanes (one ldp/stp pair
   each) used by the unrolled copy loop.  */
#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#include <sysdep.h>
/* void *memcpy (void *dstin, const void *src, size_t count)

   In:   x0 = destination, x1 = source, x2 = byte count.
   Out:  x0 = original destination pointer (x0 is never written;
	 the working pointer is x6).
   Copies forwards only, so the regions must not overlap (that is
   memmove's job).  Unaligned loads/stores are assumed legal; only
   SRC is realigned (to 16 bytes) on the large-copy path.  */
ENTRY_ALIGN (memcpy, 6)

	mov	dst, dstin		/* Preserve x0 for the return value.  */
	cmp	count, #64
	b.ge	L(cpy_not_short)
	cmp	count, #15
	b.le	L(tail15tiny)

	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */
L(tail63):
	/* Copy up to 48 bytes of data.  At this point we only need the
	   bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30	/* tmp1 = count & 48: 0, 16, 32 or 48.  */
	b.eq	L(tail15)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f			/* 32 bytes: skip the first pair.  */
	b.lt	2f			/* 16 bytes: skip the first two pairs.  */
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

L(tail15):
	/* Copy the final 0-15 bytes with a single 16-byte pair ending
	   exactly at the buffer end; it may overlap bytes already
	   copied.  Safe here because every path into this label has
	   already copied at least 16 bytes.  */
	ands	count, count, #15
	b.eq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	RET

L(tail15tiny):
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied (no over-reading): test each bit of count
	   (8/4/2/1) and move exactly that many bytes, post-incrementing
	   both pointers.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(cpy_not_short):
	/* We don't much care about the alignment of DST, but we want SRC
	   to be 128-bit (16 byte) aligned so that we don't cross cache line
	   boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	   around copying sub-Quadword quantities.  We know that
	   it can't overrun: count >= 64 on entry to this path.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63)
2:
	subs	count, count, #128
	b.ge	L(cpy_body_large)
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	   to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f		/* Anything left after these 64?  */
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	   64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(cpy_body_large):
	/* There are at least 128 bytes to copy.  The loop is software
	   pipelined: 64 bytes are pre-loaded here, then each iteration
	   stores the previous 64 bytes while loading the next 64.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!	/* Pre-indexed: dst += 64 as well.  */
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the pipeline: store the 64 bytes still held in A-D,
	   then undo the pre-bias on both pointers.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f		/* Remainder mod 64 goes to the tail.  */
	b.ne	L(tail63)
	RET
END (memcpy)
libc_hidden_builtin_def (memcpy)