blob: 18ed8a81ab2ff5ec69f15b1f5a83f44831dd04ca [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/* Optimized version of the standard memset() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2015 Free Software Foundation, Inc.
4 Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
5 Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <http://www.gnu.org/licenses/>. */
20
21/* Return: dest
22
23 Inputs:
24 in0: dest
25 in1: value
26 in2: count
27
28 The algorithm is fairly straightforward: set byte by byte until we
29 get to a 16B-aligned address, then loop on 128 B chunks using an
30 early store as prefetching, then loop on 32B chunks, then clear remaining
31 words, finally clear remaining bytes.
32 Since a stf.spill f0 can store 16B in one go, we use this instruction
33 to get peak speed when value = 0. */
34
35#include <sysdep.h>
// NOTE(review): presumably <sysdep.h> defines a `ret` macro that would
// otherwise be expanded inside the `br.ret.*` mnemonics used below -- confirm
// against sysdep.h.
36#undef ret
37
// Incoming arguments, named after the stacked input registers that carry them.
38#define dest in0
39#define value in1
40#define cnt in2
41
// General registers used as temporaries by the routine:
//   tmp      short-lived temporary (also the target of the entry `alloc`)
//   save_lc  copy of ar.lc taken on entry, written back before returning
//   ptr0/ptr1/ptr2/ptr3/ptr9
//            store addresses; ptr1 tracks the next unfilled address on the
//            main path, ptr9 is the store-ahead pointer of the line loops,
//            ptr3 addresses the last byte on the small-count path
//   loopcnt  value loaded into ar.lc for the counted loops
//   linecnt  number of whole LINE_SIZE lines (cnt >> LSIZE_SH)
//   bytecnt  bytes from dest up to the next (MIN1+1)-byte boundary
42#define tmp r31
43#define save_lc r30
44#define ptr0 r29
45#define ptr1 r28
46#define ptr2 r27
47#define ptr3 r26
48#define ptr9 r24
49#define loopcnt r23
50#define linecnt r22
51#define bytecnt r21
52
// Floating-point register that receives the replicated fill value (setf.sig)
// so that the main loops can store it with stf8.
53#define fvalue f6
54
55// This routine uses only scratch predicate registers (p6 - p15)
// p_nz/p_zr   : value != 0 / value == 0
// p_unalgn    : dest is not (MIN1+1)-byte aligned
// p_y/p_n and p_yy/p_nn : general-purpose yes/no pairs, reused per step
56#define p_scr p6 // default register for same-cycle branches
57#define p_nz p7
58#define p_zr p8
59#define p_unalgn p9
60#define p_y p11
61#define p_n p12
62#define p_yy p13
63#define p_nn p14
64
// movi0 is a plain `mov`; it is the spelling used below for moves to and from
// ar.lc.
65#define movi0 mov
66
// Tuning constants:
//   MIN1        alignment mask; MIN1+1 (16) is the alignment the main loops want
//   MIN1P1HALF  half of MIN1+1
//   LINE_SIZE   bytes handled per iteration of the line loops
//   LSIZE_SH    log2(LINE_SIZE)
//   PREF_AHEAD  maximum number of lines touched by the initial store-ahead loop
67#define MIN1 15
68#define MIN1P1HALF 8
69#define LINE_SIZE 128
70#define LSIZE_SH 7 // shift amount
71#define PREF_AHEAD 8
72
// Select the 8-byte store used by the non-zero-value loops: USE_INT stores the
// integer register `value` with st8, USE_FLP stores `fvalue` with stf8.
// USE_FLP is the variant enabled here.
73#define USE_FLP
74#if defined(USE_INT)
75#define store st8
76#define myval value
77#elif defined(USE_FLP)
78#define store stf8
79#define myval fvalue
80#endif
81
// memset (dest = in0, value = in1, cnt = in2)
//
// Fills cnt bytes starting at dest with the byte replicated from value and
// returns dest in ret0.
//
// Phases, in source order:
//   entry                       early return for cnt == 0; cnt < 16 is sent
//                               to .move_bytes_unaligned
//   alignment stores            st8/st4/st2/st1 up to the next 16B boundary
//   L1A  (.pref_l1a, .l1ax)     whole 128B lines, value != 0
//   L1B  (.l1b, .pref_l1b, .l1bx)
//                               whole 128B lines, value == 0 (stf.spill f0)
//   .fraction_of_line, .l2      32B chunks
//   store_words                 up to three 8B stores
//   .move_bytes_from_alignment  final st4/st2/st1, then .restore_and_exit
//   .move_bytes_unaligned       self-contained path for cnt < 16
//
// ar.lc is copied to save_lc on entry and written back before every return
// except the immediate cnt == 0 return, which is taken before ar.lc is
// modified.
82.align 64
83ENTRY(memset)
// Entry: allocate a frame with three inputs, issue a prefetch hint for dest,
// save ar.lc, set the return value and compute the value == 0 and cnt == 0
// predicates.
84{ .mmi
85 .prologue
86 alloc tmp = ar.pfs, 3, 0, 0, 0
87 lfetch.nt1 [dest]
88 .save ar.lc, save_lc
89 movi0 save_lc = ar.lc
90} { .mmi
91 .body
92 mov ret0 = dest // return value
93 cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
94 cmp.eq p_scr, p0 = cnt, r0
95;; }
// ptr2 = dest rounded down to 16B, tmp = dest & 15, p_y/p_n = dest odd/even
// (consumed only by .move_bytes_unaligned).  ptr1 starts at dest.
96{ .mmi
97 and ptr2 = -(MIN1+1), dest // aligned address
98 and tmp = MIN1, dest // prepare to check for alignment
99 tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
100} { .mib
101 mov ptr1 = dest
102 mux1 value = value, @brcst // create 8 identical bytes in word
103(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
104;; }
// p_unalgn = dest is not 16B aligned; bytecnt = 16 - (dest & 15).
// Counts below 16 are handled entirely by .move_bytes_unaligned, whatever the
// alignment of dest.
105{ .mib
106 cmp.ne p_unalgn, p0 = tmp, r0
107} { .mib // NB: # of bytes to move is 1 higher
108 sub bytecnt = (MIN1+1), tmp // than loopcnt
109 cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
110(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
111;; }
// Alignment stores (cnt >= 16).  For an unaligned dest, ptr1 becomes the next
// 16B boundary and ptr2 the address 8 bytes below it.  Bits 3..0 of bytecnt
// then select one st8, st4, st2 and st1.  Each step either stores at ptr2 and
// post-decrements ptr2 by half the store size, or skips the store and advances
// ptr2 by that same amount, so ptr2 always addresses the next candidate store.
// cnt is reduced by the size of every store performed.
// For an aligned dest p_unalgn is false, so the .unc tests clear both of their
// target predicates and none of the predicated stores/adjustments execute.
112{ .mmi
113(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
114(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
115(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
116;; }
117{ .mib
118(p_y) add cnt = -8, cnt
119(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
120} { .mib
121(p_y) st8 [ptr2] = value, -4
122(p_n) add ptr2 = 4, ptr2
123;; }
124{ .mib
125(p_yy) add cnt = -4, cnt
126(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
127} { .mib
128(p_yy) st4 [ptr2] = value, -2
129(p_nn) add ptr2 = 2, ptr2
130;; }
// Besides the st2 step, this group loads the threshold for the line-loop test
// and copies the replicated value into fvalue for the stf8 stores.
131{ .mmi
132 mov tmp = LINE_SIZE+1 // for compare
133(p_y) add cnt = -2, cnt
134(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
135} { .mmi
136 setf.sig fvalue=value // transfer value to FLP side
137(p_y) st2 [ptr2] = value, -1
138(p_n) add ptr2 = 1, ptr2
139;; }
140
// Last alignment store plus the size test.  No stop separates the compare from
// the final `add cnt = -1`, so the compare sees cnt before that decrement;
// presumably this is why the threshold is LINE_SIZE+1 rather than LINE_SIZE.
141{ .mmi
142(p_yy) st1 [ptr2] = value
143 cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
144} { .mbb
145(p_yy) add cnt = -1, cnt
146(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
147;; }
148
// linecnt = number of whole 128B lines.  A zero fill value takes the
// stf.spill variant at .l1b; otherwise fall through into L1A.
149{ .mib
150 nop.m 0
151 shr.u linecnt = cnt, LSIZE_SH
152(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
153;; }
154
155#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
156 .align 32 // -------- // L1A: store ahead into cache lines; fill later
157#endif
// L1A setup: tmp = bytes contained in whole lines, cnt = bytes left over after
// them, ptr9 = store-ahead pointer starting at ptr1.  loopcnt is
// PREF_AHEAD-1, or linecnt-1 when fewer than PREF_AHEAD lines exist.
158{ .mmi
159 and tmp = -(LINE_SIZE), cnt // compute end of range
160 mov ptr9 = ptr1 // used for prefetching
161 and cnt = (LINE_SIZE-1), cnt // remainder
162} { .mmi
163 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
164 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
165;; }
// ptr2 starts 8 bytes into the first line (the first 8 bytes of each line are
// written by the store-ahead stores); ptr1 becomes the end of the whole-line
// range.
166{ .mmi
167(p_scr) add loopcnt = -1, linecnt // start of stores
168 add ptr2 = 8, ptr1 // (beyond prefetch stores)
169 add ptr1 = tmp, ptr1 // first address beyond total
170;; } // range
171{ .mmi
172 add tmp = -1, linecnt // next loop count
173 movi0 ar.lc = loopcnt
174;; }
// Store-ahead loop: one 8B store at the start of each of the first
// loopcnt+1 lines, stepping ptr9 by 128.
175.pref_l1a:
176{ .mib
177 store [ptr9] = myval, 128 // Do stores one cache line apart
178 nop.i 0
179 br.cloop.dptk.few .pref_l1a
180;; }
// ptr0 runs 16 bytes ahead of ptr2; ar.lc = linecnt-1 so the fill loop below
// executes once per line.
181{ .mmi
182 add ptr0 = 16, ptr2 // Two stores in parallel
183 movi0 ar.lc = tmp
184;; }
// Fill loop, one 128B line per iteration.  Relative to the line start, ptr2
// writes offsets 8,16,40,48,72,80,104,112 and ptr0 writes 24,32,56,64,88,96,
// 120; offset 0 was written by a store-ahead store.  Both pointers finish at
// the same offsets within the next line.  While ptr9 is still below ptr1 (end
// of the whole-line range), each iteration also issues one more store-ahead
// store.
185.l1ax:
186 { .mmi
187 store [ptr2] = myval, 8
188 store [ptr0] = myval, 8
189 ;; }
190 { .mmi
191 store [ptr2] = myval, 24
192 store [ptr0] = myval, 24
193 ;; }
194 { .mmi
195 store [ptr2] = myval, 8
196 store [ptr0] = myval, 8
197 ;; }
198 { .mmi
199 store [ptr2] = myval, 24
200 store [ptr0] = myval, 24
201 ;; }
202 { .mmi
203 store [ptr2] = myval, 8
204 store [ptr0] = myval, 8
205 ;; }
206 { .mmi
207 store [ptr2] = myval, 24
208 store [ptr0] = myval, 24
209 ;; }
210 { .mmi
211 store [ptr2] = myval, 8
212 store [ptr0] = myval, 32
213 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
214 ;; }
215{ .mmb
216 store [ptr2] = myval, 24
217(p_scr) store [ptr9] = myval, 128
218 br.cloop.dptk.few .l1ax
219;; }
// All whole lines are written.  With 8 or more bytes left continue at
// .fraction_of_line, otherwise go straight to the sub-word tail; execution
// never falls through into L1B from here.
220{ .mbb
221 cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
222(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
223 br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
224;; }
225
226#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
227 { nop 0 }
228#else
229 .align 32
230#endif
// L1B: same structure as L1A, but for value == 0 it stores f0 with stf.spill,
// 16 bytes per store.  Setup is identical except that ptr2 starts 16 bytes
// into the first line.
231.l1b: // ------------------ // L1B: store ahead into cache lines; fill later
232{ .mmi
233 and tmp = -(LINE_SIZE), cnt // compute end of range
234 mov ptr9 = ptr1 // used for prefetching
235 and cnt = (LINE_SIZE-1), cnt // remainder
236} { .mmi
237 mov loopcnt = PREF_AHEAD-1 // default prefetch loop
238 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
239;; }
240{ .mmi
241(p_scr) add loopcnt = -1, linecnt
242 add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
243 add ptr1 = tmp, ptr1 // first address beyond total range
244;; }
245{ .mmi
246 add tmp = -1, linecnt // next loop count
247 movi0 ar.lc = loopcnt
248;; }
// Store-ahead loop: one 16B store at the start of each of the first
// loopcnt+1 lines.
249.pref_l1b:
250{ .mib
251 stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
252 nop.i 0
253 br.cloop.dptk.few .pref_l1b
254;; }
255{ .mmi
256 add ptr0 = 16, ptr2 // Two stores in parallel
257 movi0 ar.lc = tmp
258;; }
// Fill loop, one 128B line per iteration.  Relative to the line start, ptr2
// writes offsets 16,48,80,112 and ptr0 writes 32,64,96; offset 0 was written
// by a store-ahead store.  As in L1A, one more store-ahead store is issued per
// iteration while ptr9 is below ptr1.
259.l1bx:
260 { .mmi
261 stf.spill [ptr2] = f0, 32
262 stf.spill [ptr0] = f0, 32
263 ;; }
264 { .mmi
265 stf.spill [ptr2] = f0, 32
266 stf.spill [ptr0] = f0, 32
267 ;; }
268 { .mmi
269 stf.spill [ptr2] = f0, 32
270 stf.spill [ptr0] = f0, 64
271 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
272 ;; }
273{ .mmb
274 stf.spill [ptr2] = f0, 32
275(p_scr) stf.spill [ptr9] = f0, 128
276 br.cloop.dptk.few .l1bx
277;; }
// Fewer than 8 bytes left: skip to the sub-word tail.  Otherwise fall through
// into .fraction_of_line.
278{ .mib
279 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
280(p_scr) br.cond.dpnt.many .move_bytes_from_alignment
281;; }
282
// 32B chunks.  ptr1 is the next unfilled address, ptr2 runs 16 bytes ahead of
// it.  loopcnt = cnt / 32; when that is zero the loop is skipped and cnt is
// left as it is.
283.fraction_of_line:
284{ .mib
285 add ptr2 = 16, ptr1
286 shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
287;; }
288{ .mib
289 cmp.eq p_scr, p0 = loopcnt, r0
290 add loopcnt = -1, loopcnt
291(p_scr) br.cond.dpnt.many store_words
292;; }
293{ .mib
294 and cnt = 0x1f, cnt // compute the remaining cnt
295 movi0 ar.lc = loopcnt
296;; }
297#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
298 .align 32
299#endif
// Per iteration ptr1 writes offsets 0 and 8 and ptr2 writes offsets 16 and 24
// of the current 32B chunk; both pointers advance by 32.
300.l2: // ---------------------------- // L2A: store 32B in 2 cycles
301{ .mmb
302 store [ptr1] = myval, 8
303 store [ptr2] = myval, 8
304;; } { .mmb
305 store [ptr1] = myval, 24
306 store [ptr2] = myval, 24
307 br.cloop.dptk.many .l2
308;; }
// Remaining 8B words.  Up to three 8B stores through ptr1; each `cmp.le 16`
// shares an instruction group with the `add cnt = -8` next to it, so it tests
// cnt before that subtraction and decides whether a further store is needed.
309store_words:
310{ .mib
311 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
312(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
313;; }
314
315{ .mmi
316 store [ptr1] = myval, 8 // store
317 cmp.le p_y, p_n = 16, cnt //
318 add cnt = -8, cnt // subtract
319;; }
320{ .mmi
321(p_y) store [ptr1] = myval, 8 // store
322(p_y) cmp.le.unc p_yy, p_nn = 16, cnt //
323(p_y) add cnt = -8, cnt // subtract
324;; }
325{ .mmi // store
326(p_yy) store [ptr1] = myval, 8 //
327(p_yy) add cnt = -8, cnt // subtract
328;; }
329
// Sub-word tail.  The low three bits of cnt select one st4, st2 and st1
// through ptr1, using the integer copy of the replicated value.  cnt == 0
// skips directly to the exit.
330.move_bytes_from_alignment:
331{ .mib
332 cmp.eq p_scr, p0 = cnt, r0
333 tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
334(p_scr) br.cond.dpnt.few .restore_and_exit
335;; }
336{ .mib
337(p_y) st4 [ptr1] = value, 4
338 tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
339;; }
340{ .mib
341(p_yy) st2 [ptr1] = value, 2
342 tbit.nz.unc p_y, p0 = cnt, 0
343;; }
344
345{ .mib
346(p_y) st1 [ptr1] = value
347;; }
// Common exit for the main path: restore ar.lc and return.
348.restore_and_exit:
349{ .mib
350 nop.m 0
351 movi0 ar.lc = save_lc
352 br.ret.sptk.many rp
353;; }
354
// Small-count path, reached from the entry code when cnt < 16.  On arrival
// p_y/p_n hold "dest is odd / even" and ptr1 = dest.
// An odd dest first gets one st1 so that ptr1 becomes even; ptr2 is set to
// ptr1 + 2.  Then up to three rounds of two st2 stores (4 bytes per round)
// are issued, followed by an optional st2 and an optional st1 for the last
// byte.  p_yy initially says whether at least 4 bytes remain once the
// optional leading byte is accounted for.
355.move_bytes_unaligned:
356{ .mmi
357 .pred.rel "mutex",p_y, p_n
358 .pred.rel "mutex",p_yy, p_nn
359(p_n) cmp.le p_yy, p_nn = 4, cnt
360(p_y) cmp.le p_yy, p_nn = 5, cnt
361(p_n) add ptr2 = 2, ptr1
362} { .mmi
363(p_y) add ptr2 = 3, ptr1
364(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte
365(p_y) add cnt = -1, cnt // [15, 14 (or less) left]
366;; }
// Round 1.  ptr3 = ptr1 + cnt is one past the last byte to fill (computed
// from the values at the start of this group).  ar.lc is written back from
// save_lc here.  p_y = at least 8 bytes remained before this round.
367{ .mmi
368(p_yy) cmp.le.unc p_y, p0 = 8, cnt
369 add ptr3 = ptr1, cnt // prepare last store
370 movi0 ar.lc = save_lc
371} { .mmi
372(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
373(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
374(p_yy) add cnt = -4, cnt // [11, 10 (or less) left]
375;; }
// Round 2.  ptr3 now addresses the last byte.  p_scr records bit 1 of cnt,
// i.e. whether a trailing st2 is needed; subtracting 4 does not change it.
376{ .mmi
377(p_y) cmp.le.unc p_yy, p0 = 8, cnt
378 add ptr3 = -1, ptr3 // last store
379 tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
380} { .mmi
381(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
382(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
383(p_y) add cnt = -4, cnt // [7, 6 (or less) left]
384;; }
// Round 3.  p_y is reused for bit 0 of cnt, i.e. whether a trailing st1 is
// needed.
385{ .mmi
386(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
387(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
388 // [3, 2 (or less) left]
389 tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
390} { .mmi
391(p_yy) add cnt = -4, cnt
392;; }
// Trailing st2 at ptr1 and/or st1 at the last byte (ptr3), then return.
393{ .mmb
394(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
395(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
396 br.ret.sptk.many rp
397;; }
398END(memset)
399libc_hidden_builtin_def (memset)