From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3Date: Fri, 8 Nov 2019 13:22:16 +0100
4Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
10
11This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
12Zinc patch set.
13
14Co-developed-by: René van Dorst <opensource@vdorst.com>
15Signed-off-by: René van Dorst <opensource@vdorst.com>
16Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
17Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
18Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
19Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
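Note: the AXR macro in the file below performs one add/xor/rotate step
of four ChaCha quarter-rounds at once; the eight AXR invocations in
.Loop_chacha20_xor_rounds form one double round (a column round followed
by a diagonal round). For reference, here is a minimal C sketch of the
RFC 7539 quarter-round it implements, with ROTL32 as an assumed helper
macro, not part of this patch:

	/* RFC 7539 quarter-round; illustrative sketch only */
	#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
	#define QR(a, b, c, d) do {			\
		a += b; d ^= a; d = ROTL32(d, 16);	\
		c += d; b ^= c; b = ROTL32(b, 12);	\
		a += b; d ^= a; d = ROTL32(d, 8);	\
		c += d; b ^= c; b = ROTL32(b, 7);	\
	} while (0)

The rotate amounts 16, 12, 8 and 7 are the S arguments passed to AXR.
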
 arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 arch/mips/crypto/chacha-core.S
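
Note: for a tail that still contains full 32-bit words, the code jumps
into the middle of the store sequence through a jump table. Each table
entry is 8 bytes (a branch plus its delay-slot addu), and $at holds
BYTES & MASK_U32, that is 4 * n for n remaining words, so the entry
offset is $at << 1; this is what the lui/ins/addiu sequence splices into
the table's base address. The same math as a C sketch, with a
hypothetical function name used for illustration:

	/* jmptbl_target() is a hypothetical name, not in this patch */
	unsigned long jmptbl_target(unsigned long base, unsigned int bytes)
	{
		/* 16 entries of 8 bytes each; (bytes & 0x3c) == 4 * n_words */
		return base + ((bytes & 0x3c) << 1);
	}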

--- /dev/null
+++ b/arch/mips/crypto/chacha-core.S
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $t8
+#define X9 $t9
+#define X10 $v1
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define STATE $a0
+#define OUT $a1
+#define IN $a2
+#define BYTES $a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * It must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used to handle the last bytes, which are not a multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $s7
+
+#define IS_UNALIGNED $s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+ x( 0); \
+ x( 1); \
+ x( 2); \
+ x( 3); \
+ x( 4); \
+ x( 5); \
+ x( 6); \
+ x( 7); \
+ x( 8); \
+ x( 9); \
+ x(10); \
+ x(11); \
+ x(12); \
+ x(13); \
+ x(14); \
+ x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+ x(15); \
+ x(14); \
+ x(13); \
+ x(12); \
+ x(11); \
+ x(10); \
+ x( 9); \
+ x( 8); \
+ x( 7); \
+ x( 6); \
+ x( 5); \
+ x( 4); \
+ x( 3); \
+ x( 2); \
+ x( 1); \
+ x( 0);
+
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lw T1, (x*4) ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ sw X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn.
+ * Every jump table entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent chacha20_mips
+chacha20_mips:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Return if BYTES = 0. */
+ beqz BYTES, .Lchacha20_mips_end
+
+ lw NONCE_0, 48(STATE)
+
+ /* Save s0-s7 */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ sw $s7, 28($sp)
+
+ /* Test if IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ */
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
+
+ /* Set number of rounds */
+ li $at, 20
+
+ b .Lchacha20_rounds_start
+
+.align 4
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+.Lchacha20_rounds_start:
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+
+ move X12, NONCE_0
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_chacha20_xor_rounds:
+ addiu $at, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $at, .Loop_chacha20_xor_rounds
+
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+
+ /* If data src/dst is unaligned, branch */
+ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
+
+ /* Set number of rounds here to fill delay slot. */
+ li $at, 20
+
+ /* If BYTES < 0, there is no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
+
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha20_mips_xor_bytes
+
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+.Lchacha20_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
+
+.Lchacha20_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining byte counter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha20_unaligned:
+ /* Set number of rounds here to fill delay slot. */
+ li $at, 20
+
+ /* If BYTES < 0, there is no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
+
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha20_mips_xor_done
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha20_mips_xor_bytes:
+ addu IN, $at
+ addu OUT, $at
+ /* First byte */
+ lbu T1, 0(IN)
+ addiu $at, BYTES, 1
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ sb T1, 0(OUT)
+ beqz $at, .Lchacha20_mips_xor_done
+ /* Second byte */
+ lbu T1, 1(IN)
+ addiu $at, BYTES, 2
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 1(OUT)
+ beqz $at, .Lchacha20_mips_xor_done
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 2(OUT)
+ b .Lchacha20_mips_xor_done
+
+.Lchacha20_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining byte counter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha20_mips
+.set at
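
Note: the register assignments in this file (STATE in $a0, OUT in $a1,
IN in $a2, BYTES in $a3) correspond to a C prototype along these lines;
this is a sketch inferred from the assembly, not copied from the
separate Zinc glue code patch:

	/* Prototype inferred from the register usage above; the actual
	 * declaration lives in the corresponding glue code patch. */
	asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in,
				      size_t bytes);

The routine keeps the counter word at offset 48 (state[12]) in $v0 and
writes the incremented value back into the state before returning.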