blob: beaada6fc73c11fe8ad2debae12ff5c1d3cd7666 [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21/* Return: dest
22
23 Inputs:
24 in0: dest
25 in1: src
26 in2: byte count
27
28 The core of the function is the memcpy implementation used in memcpy.S.
29 When bytes have to be copied backwards, only the easy case, when
30 all arguments are multiples of 8, is optimised.
31
32 In this form, it assumes little endian mode. For big endian mode,
33 either sh1 must be computed using an extra instruction (sub sh1 = 64, sh1),
34 or the UM.be bit should be cleared at the beginning and set at the end. */
35
36#include "sysdep.h"
37#undef ret
38
/* Size parameters.  OP_T_THRES is the length at or below which the forward
   path does no alignment work and copies byte by byte (tested at .next).
   OPSIZ is the word size in bytes that the "& -OPSIZ" comments refer to.  */
39#define OP_T_THRES 16
40#define OPSIZ 8
41
/* Symbolic names for the general registers used by memmove:
     adest, asrc          second destination / source pointers (dest + 8 and
                          src + 8 in the aligned loop .l0); asrc is also the
                          8-byte-aligned source pointer read by LOOP(shift)
     saved_pr, saved_lc   values of pr and ar.lc on entry, restored on exit
     dest, src, len       working copies of in0, in1, in2
     tmp2, tmp3, tmp4     temporaries
     ptable, ploop56,
     loopaddr             used to compute the target of the indirect jump
                          into one of the LOOP(shift) bodies
     sh1                  source misalignment: src % 8, later multiplied by 8
     loopcnt              value that gets moved into ar.lc
     value                datum in flight outside the rotating registers  */
42#define adest r15
43#define saved_pr r17
44#define saved_lc r18
45#define dest r19
46#define src r20
47#define len r21
48#define asrc r22
49#define tmp2 r23
50#define tmp3 r24
51#define tmp4 r25
52#define ptable r26
53#define ploop56 r27
54#define loopaddr r28
55#define sh1 r29
56#define loopcnt r30
57#define value r31
58
/* ALIGN(n): alignment request used at the top of each LOOP(shift) body.
   When GAS_ALIGN_BREAKS_UNWIND_INFO is defined the .align directive is
   avoided and a single nop is emitted in its place.  */
59#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
60# define ALIGN(n) { nop 0 }
61#else
62# define ALIGN(n) .align n
63#endif
64
/* LOOP(shift): emits the loop labelled .loop<shift>, used when dest is
   8-byte aligned but src is not.  Per iteration, in pipeline-stage order:
     stage 0          load the next aligned word from asrc into r[0]
     stage MEMLAT     shrp merges r[MEMLAT] and r[MEMLAT+1] (two consecutively
                      loaded words) by `shift' bits into value
     stage MEMLAT+1   store value to dest
   Both pointers advance by 8.  When br.ctop falls through, control goes to
   .cpyfew to copy the remaining len bytes.
   NOTE(review): the two nop.b slots presumably pad every instance to the same
   size and bundle layout -- confirm before changing the body.  */
65#define LOOP(shift) \
66 ALIGN(32); \
67.loop##shift : \
68(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
69(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
70(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
71 nop.b 0 ; \
72 nop.b 0 ; \
73 br.ctop.sptk .loop##shift ; \
74 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
75
/* MEMLAT is the number of pipeline stages between a load and the store of
   the same datum in the loops of memmove (presumably chosen to cover load
   latency -- confirm).  Nrot is 2*MEMLAT+3, the combined size of the r[] and
   q[] rotating arrays declared with .rotr, rounded up to a multiple of 8; it
   is the frame size and rotating-region size passed to alloc.  */
76#define MEMLAT 21
77#define Nrot (((2*MEMLAT+3) + 7) & ~7)
78
/* memmove: in0 = dest, in1 = src, in2 = byte count.  Returns dest in ret0.

   Control flow, as written below:
     len == 0                          -> .restore_and_exit
     dest <= src, or dest >= src+len   -> .forward
     otherwise                         -> .backward

   .forward chooses between:
     (dest | src | len) & 7 == 0   two-stream loop .l0, 16 bytes per iteration
     len <= OP_T_THRES             byte loop at .cpyfew
     anything else                 byte loop .l1 until dest is 8-byte aligned,
                                   then .l3 (src aligned too) or one of the
                                   LOOP(shift) bodies, then .cpyfew for the tail

   .backward uses the word loop .l5 when (dest | src | len) & 7 == 0 and the
   byte loop .l6 otherwise.

   pr and ar.lc are saved in saved_pr / saved_lc on entry and restored on
   every exit path.  */
79ENTRY(memmove)
80 .prologue
/* alloc operands: 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating.  */
81 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
/* Rotating names: r[] is used by every pipelined loop, q[] only by the
   second load/store stream of .l0, p[] holds the stage predicates.  */
82 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
83 .rotp p[MEMLAT + 2]
84 mov ret0 = in0 /* return value = dest */
85 .save pr, saved_pr
86 mov saved_pr = pr /* save the predicate registers */
87 .save ar.lc, saved_lc
88 mov saved_lc = ar.lc /* save the loop counter */
89 .body
/* Classify the request.  tmp4 receives the low three bits of
   dest | src | len (zero means all three are multiples of 8).  tmp2 = -dest
   is consumed at .next to find how many bytes bring dest to 8-byte
   alignment.  */
90 or tmp3 = in0, in1 ;; /* tmp3 = dest | src */
91 or tmp3 = tmp3, in2 /* tmp3 = dest | src | len */
92 mov dest = in0 /* dest */
93 mov src = in1 /* src */
94 mov len = in2 /* len */
95 sub tmp2 = r0, in0 /* tmp2 = -dest */
96 cmp.eq p6, p0 = in2, r0 /* if (len == 0) */
97(p6) br.cond.spnt .restore_and_exit;;/* return dest; */
98 and tmp4 = 7, tmp3 /* tmp4 = (dest | src | len) & 7 */
/* Direction test.
   NOTE(review): cmp.le and cmp.lt are the signed forms; they match an
   unsigned address comparison only if the two buffers never lie on opposite
   sides of the sign boundary -- confirm, or consider cmp.leu / cmp.ltu.  */
99 cmp.le p6, p0 = dest, src /* if dest <= src it's always safe */
100(p6) br.cond.spnt .forward /* to copy forward */
101 add tmp3 = src, len;;
102 cmp.lt p6, p0 = dest, tmp3 /* if dest > src && dest < src + len */
103(p6) br.cond.spnt .backward /* we have to copy backward */
104
/* Forward copy.  The loopcnt computed here (len / 16) is used only by the
   fully aligned path; .next recomputes loopcnt for the other paths.  */
105.forward:
106 shr.u loopcnt = len, 4 ;; /* loopcnt = len / 16 */
107 cmp.ne p6, p0 = tmp4, r0 /* if ((dest | src | len) & 7 != 0) */
108(p6) br.cond.sptk .next /* goto next; */
109
110/* The optimal case, when dest, src and len are all multiples of 8 */
111
/* len is a multiple of 8 here, so len & 0xf is either 0 or 8.  When it is 8
   one word is copied up front so that the remainder is a multiple of 16.  */
112 and tmp3 = 0xf, len
113 mov pr.rot = 1 << 16 /* set rotating predicates */
114 mov ar.ec = MEMLAT + 1 ;; /* set the epilog counter */
115 cmp.ne p6, p0 = tmp3, r0 /* do we have to copy an extra word? */
116 adds loopcnt = -1, loopcnt;; /* --loopcnt */
117(p6) ld8 value = [src], 8;;
118(p6) st8 [dest] = value, 8 /* copy the "odd" word */
119 mov ar.lc = loopcnt /* set the loop counter */
/* len == 8: the odd word above was the whole copy (len / 16 is 0), so the
   pipelined loop is skipped.  */
120 cmp.eq p6, p0 = 8, len
121(p6) br.cond.spnt .restore_and_exit;;/* the one-word special case */
122 adds adest = 8, dest /* set adest one word ahead of dest */
123 adds asrc = 8, src ;; /* set asrc one word ahead of src */
124 nop.b 0 /* get the "golden" alignment for */
125 nop.b 0 /* the next loop */
/* Two interleaved streams: src/dest handle the even words and asrc/adest
   the odd words.  All four pointers step by 16, and each store is issued
   MEMLAT stages after the load of the same word.  */
126.l0:
127(p[0]) ld8 r[0] = [src], 16
128(p[0]) ld8 q[0] = [asrc], 16
129(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
130(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
131 br.ctop.dptk .l0 ;;
132
/* Same three instructions as .restore_and_exit, placed inline here.  */
133 mov pr = saved_pr, -1 /* restore the predicate registers */
134 mov ar.lc = saved_lc /* restore the loop counter */
135 br.ret.sptk.many b0
/* Forward copy where at least one of dest, src, len is not a multiple of 8.  */
136.next:
137 cmp.ge p6, p0 = OP_T_THRES, len /* is len <= OP_T_THRES */
138 and loopcnt = 7, tmp2 /* loopcnt = -dest % 8 */
139(p6) br.cond.spnt .cpyfew /* copy byte by byte */
140 ;;
141 cmp.eq p6, p0 = loopcnt, r0
142(p6) br.cond.sptk .dest_aligned
143 sub len = len, loopcnt /* len -= -dest % 8 */
144 adds loopcnt = -1, loopcnt /* --loopcnt */
145 ;;
146 mov ar.lc = loopcnt
147.l1: /* copy -dest % 8 bytes */
148 ld1 value = [src], 1 /* value = *src++ */
149 ;;
150 st1 [dest] = value, 1 /* *dest++ = value */
151 br.cloop.dptk .l1
/* dest is now 8-byte aligned.  Split the remaining length into whole words
   (loopcnt, tmp2 = byte size of those words) and a tail of len % 8 bytes
   that .cpyfew copies afterwards.  */
152.dest_aligned:
153 and sh1 = 7, src /* sh1 = src % 8 */
154 and tmp2 = -8, len /* tmp2 = len & -OPSIZ */
155 and asrc = -8, src /* asrc = src & -OPSIZ -- align src */
156 shr.u loopcnt = len, 3 /* loopcnt = len / 8 */
157 and len = 7, len;; /* len = len % 8 */
158 adds loopcnt = -1, loopcnt /* --loopcnt */
/* tmp4 / tmp3 are loaded from below to obtain &.table and &.loop56.  */
159 addl tmp4 = @ltoff(.table), gp
160 addl tmp3 = @ltoff(.loop56), gp
161 mov ar.ec = MEMLAT + 1 /* set EC */
162 mov pr.rot = 1 << 16;; /* set rotating predicates */
163 mov ar.lc = loopcnt /* set LC */
164 cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */
165(p6) br.cond.sptk .src_aligned
/* Misaligned source.  The LOOP bodies read through asrc, so src is moved
   past the whole words now and is left pointing at the tail for .cpyfew.
   sh1 becomes 8 * (src % 8), which is used here as the byte offset of the
   matching 8-byte entry in .table; that entry is subtracted from &.loop56 to
   obtain the address of .loop<sh1>.  */
166 add src = src, tmp2 /* src += len & -OPSIZ */
167 shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */
168 ld8 ploop56 = [tmp3] /* ploop56 = &loop56 */
169 ld8 ptable = [tmp4];; /* ptable = &table */
170 add tmp3 = ptable, sh1;; /* tmp3 = &table + sh1 */
/* LOOP stores at stage MEMLAT + 1 (one stage after the shrp), hence the
   larger epilog count than in the aligned loops.  */
171 mov ar.ec = MEMLAT + 1 + 1 /* one more pass needed */
172 ld8 tmp4 = [tmp3];; /* tmp4 = loop offset */
173 sub loopaddr = ploop56,tmp4 /* loopaddr = &loop56 - loop offset */
/* Preload the first aligned source word into r[1]; the loop's own loads go
   into r[0], so this word is the older of the pair for the first shrp.  */
174 ld8 r[1] = [asrc], 8;; /* w0 */
175 mov b6 = loopaddr;;
176 br b6 /* jump to the appropriate loop */
177
/* One loop body per possible source misalignment, in bits.  Each one ends
   with a branch to .cpyfew.  */
178 LOOP(8)
179 LOOP(16)
180 LOOP(24)
181 LOOP(32)
182 LOOP(40)
183 LOOP(48)
184 LOOP(56)
185
/* dest and src both 8-byte aligned: plain pipelined word copy, then fall
   through to .cpyfew for the len % 8 tail.  */
186.src_aligned:
187.l3:
188(p[0]) ld8 r[0] = [src], 8
189(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
190 br.ctop.dptk .l3
/* Copy the remaining len bytes one at a time (len may be 0 here).  len - 1
   is the value loaded into ar.lc for the counted loop .l4.  */
191.cpyfew:
192 cmp.eq p6, p0 = len, r0 /* is len == 0 ? */
193 adds len = -1, len /* --len; */
194(p6) br.cond.spnt .restore_and_exit ;;
195 mov ar.lc = len
196.l4:
197 ld1 value = [src], 1
198 ;;
199 st1 [dest] = value, 1
200 br.cloop.dptk .l4 ;;
/* Common exit: put back the caller's pr and ar.lc, then return.  */
201.restore_and_exit:
202 mov pr = saved_pr, -1 /* restore the predicate registers */
203 mov ar.lc = saved_lc /* restore the loop counter */
204 br.ret.sptk.many b0
205
206/* In the case of a backward copy, optimise only the case when everything
207 is a multiple of 8, otherwise copy byte by byte. The backward copy is
208 used only when the blocks are overlapping and dest > src.
209*/
/* ar.ec and pr.rot are set up here, before the branch, so they are in
   place for both .l5 and .l6.  The loopcnt computed here (len / 8 - 1) is
   used only by .l5; .bytecopy recomputes it.  */
210.backward:
211 shr.u loopcnt = len, 3 /* loopcnt = len / 8 */
212 add src = src, len /* src points one byte past the end */
213 add dest = dest, len ;; /* dest points one byte past the end */
214 mov ar.ec = MEMLAT + 1 /* set the epilog counter */
215 mov pr.rot = 1 << 16 /* set rotating predicates */
216 adds loopcnt = -1, loopcnt /* --loopcnt */
217 cmp.ne p6, p0 = tmp4, r0 /* if ((dest | src | len) & 7 != 0) */
218(p6) br.cond.sptk .bytecopy ;; /* copy byte by byte backward */
219 adds src = -8, src /* src points to the last word */
220 adds dest = -8, dest /* dest points to the last word */
221 mov ar.lc = loopcnt;; /* set the loop counter */
/* Pipelined word copy from the highest address downwards.  */
222.l5:
223(p[0]) ld8 r[0] = [src], -8
224(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
225 br.ctop.dptk .l5
226 br.cond.sptk .restore_and_exit
/* Same loop shape as .l5, one byte at a time.  */
227.bytecopy:
228 adds src = -1, src /* src points to the last byte */
229 adds dest = -1, dest /* dest points to the last byte */
230 adds loopcnt = -1, len;; /* loopcnt = len - 1 */
231 mov ar.lc = loopcnt;; /* set the loop counter */
232.l6:
233(p[0]) ld1 r[0] = [src], -1
234(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
235 br.ctop.dptk .l6
236 br.cond.sptk .restore_and_exit
237END(memmove)
238
/* Dispatch table for the misaligned-source path of memmove.  Entry k
   (k = 1..7) holds the distance from .loop<8k> to .loop56; memmove indexes
   it with the byte offset 8 * (src % 8) and subtracts the entry from
   &.loop56 to get the address of the loop to jump to.  Entry 0 is a
   placeholder: src % 8 == 0 takes the .src_aligned path and never reads the
   table.  */
239 .rodata
240 .align 8
241.table:
242 data8 0 /* dummy entry */
243 data8 .loop56 - .loop8
244 data8 .loop56 - .loop16
245 data8 .loop56 - .loop24
246 data8 .loop56 - .loop32
247 data8 .loop56 - .loop40
248 data8 .loop56 - .loop48
249 data8 .loop56 - .loop56
250
/* libc_hidden_def is not defined in this file (presumably it comes in
   through sysdep.h -- confirm); it is applied here to the memmove symbol.  */
251libc_hidden_def (memmove)