/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
16
17#include <linux/errno.h>
18#include <linux/linkage.h>
19#include <variant/core.h>
20#include <asm/asmmacro.h>
21
22/*
23 * computes a partial checksum, e.g. for TCP/UDP fragments
24 */
25
26/*
27 * unsigned int csum_partial(const unsigned char *buf, int len,
28 * unsigned int sum);
29 * a2 = buf
30 * a3 = len
31 * a4 = sum
32 *
33 * This function assumes 2- or 4-byte alignment. Other alignments will fail!
34 */
35
36/* ONES_ADD converts twos-complement math to ones-complement. */
37#define ONES_ADD(sum, val) \
38 add sum, sum, val ; \
39 bgeu sum, val, 99f ; \
40 addi sum, sum, 1 ; \
4199: ;
42
43.text
44ENTRY(csum_partial)
45
46 /*
47 * Experiments with Ethernet and SLIP connections show that buf
48 * is aligned on either a 2-byte or 4-byte boundary.
49 */
50 entry sp, 32
51 extui a5, a2, 0, 2
52 bnez a5, 8f /* branch if 2-byte aligned */
53 /* Fall-through on common case, 4-byte alignment */
541:
55 srli a5, a3, 5 /* 32-byte chunks */
56#if XCHAL_HAVE_LOOPS
57 loopgtz a5, 2f
58#else
59 beqz a5, 2f
60 slli a5, a5, 5
61 add a5, a5, a2 /* a5 = end of last 32-byte chunk */
62.Loop1:
63#endif
64 l32i a6, a2, 0
65 l32i a7, a2, 4
66 ONES_ADD(a4, a6)
67 ONES_ADD(a4, a7)
68 l32i a6, a2, 8
69 l32i a7, a2, 12
70 ONES_ADD(a4, a6)
71 ONES_ADD(a4, a7)
72 l32i a6, a2, 16
73 l32i a7, a2, 20
74 ONES_ADD(a4, a6)
75 ONES_ADD(a4, a7)
76 l32i a6, a2, 24
77 l32i a7, a2, 28
78 ONES_ADD(a4, a6)
79 ONES_ADD(a4, a7)
80 addi a2, a2, 4*8
81#if !XCHAL_HAVE_LOOPS
82 blt a2, a5, .Loop1
83#endif
842:
85 extui a5, a3, 2, 3 /* remaining 4-byte chunks */
86#if XCHAL_HAVE_LOOPS
87 loopgtz a5, 3f
88#else
89 beqz a5, 3f
90 slli a5, a5, 2
91 add a5, a5, a2 /* a5 = end of last 4-byte chunk */
92.Loop2:
93#endif
94 l32i a6, a2, 0
95 ONES_ADD(a4, a6)
96 addi a2, a2, 4
97#if !XCHAL_HAVE_LOOPS
98 blt a2, a5, .Loop2
99#endif
1003:
101 _bbci.l a3, 1, 5f /* remaining 2-byte chunk */
102 l16ui a6, a2, 0
103 ONES_ADD(a4, a6)
104 addi a2, a2, 2
1055:
106 _bbci.l a3, 0, 7f /* remaining 1-byte chunk */
1076: l8ui a6, a2, 0
108#ifdef __XTENSA_EB__
109 slli a6, a6, 8 /* load byte into bits 8..15 */
110#endif
111 ONES_ADD(a4, a6)
1127:
113 mov a2, a4
114 retw
115
116 /* uncommon case, buf is 2-byte aligned */
1178:
118 beqz a3, 7b /* branch if len == 0 */
119 beqi a3, 1, 6b /* branch if len == 1 */
120
121 extui a5, a2, 0, 1
122 bnez a5, 8f /* branch if 1-byte aligned */
123
124 l16ui a6, a2, 0 /* common case, len >= 2 */
125 ONES_ADD(a4, a6)
126 addi a2, a2, 2 /* adjust buf */
127 addi a3, a3, -2 /* adjust len */
128 j 1b /* now buf is 4-byte aligned */
129
130 /* case: odd-byte aligned, len > 1
131 * This case is dog slow, so don't give us an odd address.
132 * (I don't think this ever happens, but just in case.)
133 */
1348:
135 srli a5, a3, 2 /* 4-byte chunks */
136#if XCHAL_HAVE_LOOPS
137 loopgtz a5, 2f
138#else
139 beqz a5, 2f
140 slli a5, a5, 2
141 add a5, a5, a2 /* a5 = end of last 4-byte chunk */
142.Loop3:
143#endif
144 l8ui a6, a2, 0 /* bits 24..31 */
145 l16ui a7, a2, 1 /* bits 8..23 */
146 l8ui a8, a2, 3 /* bits 0.. 8 */
147#ifdef __XTENSA_EB__
148 slli a6, a6, 24
149#else
150 slli a8, a8, 24
151#endif
152 slli a7, a7, 8
153 or a7, a7, a6
154 or a7, a7, a8
155 ONES_ADD(a4, a7)
156 addi a2, a2, 4
157#if !XCHAL_HAVE_LOOPS
158 blt a2, a5, .Loop3
159#endif
1602:
161 _bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */
162 l8ui a6, a2, 0
163 l8ui a7, a2, 1
164#ifdef __XTENSA_EB__
165 slli a6, a6, 8
166#else
167 slli a7, a7, 8
168#endif
169 or a7, a7, a6
170 ONES_ADD(a4, a7)
171 addi a2, a2, 2
1723:
173 j 5b /* branch to handle the remaining byte */
174
175ENDPROC(csum_partial)
176
177/*
178 * Copy from ds while checksumming, otherwise like csum_partial
179 */
180
181/*
182unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
183 int sum, int *src_err_ptr, int *dst_err_ptr)
184 a2 = src
185 a3 = dst
186 a4 = len
187 a5 = sum
188 a6 = src_err_ptr
189 a7 = dst_err_ptr
190 a8 = temp
191 a9 = temp
192 a10 = temp
193 a11 = original len for exception handling
194 a12 = original dst for exception handling
195
196 This function is optimized for 4-byte aligned addresses. Other
197 alignments work, but not nearly as efficiently.
198 */
199
200ENTRY(csum_partial_copy_generic)
201
202 entry sp, 32
203 mov a12, a3
204 mov a11, a4
205 or a10, a2, a3
206
207 /* We optimize the following alignment tests for the 4-byte
208 aligned case. Two bbsi.l instructions might seem more optimal
209 (commented out below). However, both labels 5: and 3: are out
210 of the imm8 range, so the assembler relaxes them into
211 equivalent bbci.l, j combinations, which is actually
212 slower. */
213
214 extui a9, a10, 0, 2
215 beqz a9, 1f /* branch if both are 4-byte aligned */
216 bbsi.l a10, 0, 5f /* branch if one address is odd */
217 j 3f /* one address is 2-byte aligned */
218
219/* _bbsi.l a10, 0, 5f */ /* branch if odd address */
220/* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */
221
2221:
223 /* src and dst are both 4-byte aligned */
224 srli a10, a4, 5 /* 32-byte chunks */
225#if XCHAL_HAVE_LOOPS
226 loopgtz a10, 2f
227#else
228 beqz a10, 2f
229 slli a10, a10, 5
230 add a10, a10, a2 /* a10 = end of last 32-byte src chunk */
231.Loop5:
232#endif
233EX(10f) l32i a9, a2, 0
234EX(10f) l32i a8, a2, 4
235EX(11f) s32i a9, a3, 0
236EX(11f) s32i a8, a3, 4
237 ONES_ADD(a5, a9)
238 ONES_ADD(a5, a8)
239EX(10f) l32i a9, a2, 8
240EX(10f) l32i a8, a2, 12
241EX(11f) s32i a9, a3, 8
242EX(11f) s32i a8, a3, 12
243 ONES_ADD(a5, a9)
244 ONES_ADD(a5, a8)
245EX(10f) l32i a9, a2, 16
246EX(10f) l32i a8, a2, 20
247EX(11f) s32i a9, a3, 16
248EX(11f) s32i a8, a3, 20
249 ONES_ADD(a5, a9)
250 ONES_ADD(a5, a8)
251EX(10f) l32i a9, a2, 24
252EX(10f) l32i a8, a2, 28
253EX(11f) s32i a9, a3, 24
254EX(11f) s32i a8, a3, 28
255 ONES_ADD(a5, a9)
256 ONES_ADD(a5, a8)
257 addi a2, a2, 32
258 addi a3, a3, 32
259#if !XCHAL_HAVE_LOOPS
260 blt a2, a10, .Loop5
261#endif
2622:
263 extui a10, a4, 2, 3 /* remaining 4-byte chunks */
264 extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */
265#if XCHAL_HAVE_LOOPS
266 loopgtz a10, 3f
267#else
268 beqz a10, 3f
269 slli a10, a10, 2
270 add a10, a10, a2 /* a10 = end of last 4-byte src chunk */
271.Loop6:
272#endif
273EX(10f) l32i a9, a2, 0
274EX(11f) s32i a9, a3, 0
275 ONES_ADD(a5, a9)
276 addi a2, a2, 4
277 addi a3, a3, 4
278#if !XCHAL_HAVE_LOOPS
279 blt a2, a10, .Loop6
280#endif
2813:
282 /*
283 Control comes to here in two cases: (1) It may fall through
284 to here from the 4-byte alignment case to process, at most,
285 one 2-byte chunk. (2) It branches to here from above if
286 either src or dst is 2-byte aligned, and we process all bytes
287 here, except for perhaps a trailing odd byte. It's
288 inefficient, so align your addresses to 4-byte boundaries.
289
290 a2 = src
291 a3 = dst
292 a4 = len
293 a5 = sum
294 */
295 srli a10, a4, 1 /* 2-byte chunks */
296#if XCHAL_HAVE_LOOPS
297 loopgtz a10, 4f
298#else
299 beqz a10, 4f
300 slli a10, a10, 1
301 add a10, a10, a2 /* a10 = end of last 2-byte src chunk */
302.Loop7:
303#endif
304EX(10f) l16ui a9, a2, 0
305EX(11f) s16i a9, a3, 0
306 ONES_ADD(a5, a9)
307 addi a2, a2, 2
308 addi a3, a3, 2
309#if !XCHAL_HAVE_LOOPS
310 blt a2, a10, .Loop7
311#endif
3124:
313 /* This section processes a possible trailing odd byte. */
314 _bbci.l a4, 0, 8f /* 1-byte chunk */
315EX(10f) l8ui a9, a2, 0
316EX(11f) s8i a9, a3, 0
317#ifdef __XTENSA_EB__
318 slli a9, a9, 8 /* shift byte to bits 8..15 */
319#endif
320 ONES_ADD(a5, a9)
3218:
322 mov a2, a5
323 retw
324
3255:
326 /* Control branch to here when either src or dst is odd. We
327 process all bytes using 8-bit accesses. Grossly inefficient,
328 so don't feed us an odd address. */
329
330 srli a10, a4, 1 /* handle in pairs for 16-bit csum */
331#if XCHAL_HAVE_LOOPS
332 loopgtz a10, 6f
333#else
334 beqz a10, 6f
335 slli a10, a10, 1
336 add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */
337.Loop8:
338#endif
339EX(10f) l8ui a9, a2, 0
340EX(10f) l8ui a8, a2, 1
341EX(11f) s8i a9, a3, 0
342EX(11f) s8i a8, a3, 1
343#ifdef __XTENSA_EB__
344 slli a9, a9, 8 /* combine into a single 16-bit value */
345#else /* for checksum computation */
346 slli a8, a8, 8
347#endif
348 or a9, a9, a8
349 ONES_ADD(a5, a9)
350 addi a2, a2, 2
351 addi a3, a3, 2
352#if !XCHAL_HAVE_LOOPS
353 blt a2, a10, .Loop8
354#endif
3556:
356 j 4b /* process the possible trailing odd byte */
357
358ENDPROC(csum_partial_copy_generic)
359
360
361# Exception handler:
362.section .fixup, "ax"
363/*
364 a6 = src_err_ptr
365 a7 = dst_err_ptr
366 a11 = original len for exception handling
367 a12 = original dst for exception handling
368*/
369
37010:
371 _movi a2, -EFAULT
372 s32i a2, a6, 0 /* src_err_ptr */
373
374 # clear the complete destination - computing the rest
375 # is too much work
376 movi a2, 0
377#if XCHAL_HAVE_LOOPS
378 loopgtz a11, 2f
379#else
380 beqz a11, 2f
381 add a11, a11, a12 /* a11 = ending address */
382.Leloop:
383#endif
384 s8i a2, a12, 0
385 addi a12, a12, 1
386#if !XCHAL_HAVE_LOOPS
387 blt a12, a11, .Leloop
388#endif
3892:
390 retw
391
39211:
393 movi a2, -EFAULT
394 s32i a2, a7, 0 /* dst_err_ptr */
395 movi a2, 0
396 retw
397
398.previous