From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:13 +0100
Subject: [PATCH] crypto: arm/chacha - import Eric Biggers's scalar accelerated
 ChaCha code

commit 29621d099f9c642b22a69dc8e7e20c108473a392 upstream.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm/crypto/chacha-scalar-core.S | 461 +++++++++++++++++++++++++++
 1 file changed, 461 insertions(+)
 create mode 100644 arch/arm/crypto/chacha-scalar-core.S

--- /dev/null
+++ b/arch/arm/crypto/chacha-scalar-core.S
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
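+
+/*
+ * Illustrative sketch of the just-in-time fixup (following the comments in
+ * _halfround below): the true value of a row 'b' register is (reg ror #brot),
+ * and similarly (reg ror #drot) for row 'd'. So
+ *
+ *	add	a, a, b, ror #brot	// a += b, using the true value of b
+ *	eor	d, a, d, ror #drot	// d ^= a, using the true value of d
+ *
+ * leaves 'd' holding the value it has just before the "d = rol(d, 16)" step.
+ * Rather than rotating it now, the code records drot = 32 - 16 = 16 and lets
+ * the next instruction that reads 'd' apply the rotation via its ror operand.
+ */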
+
+	// ChaCha state registers
+	X0	.req	r0
+	X1	.req	r1
+	X2	.req	r2
+	X3	.req	r3
+	X4	.req	r4
+	X5	.req	r5
+	X6	.req	r6
+	X7	.req	r7
+	X8_X10	.req	r8	// shared by x8 and x10
+	X9_X11	.req	r9	// shared by x9 and x11
+	X12	.req	r10
+	X13	.req	r11
+	X14	.req	r12
+	X15	.req	r14
+
+.Lexpand_32byte_k:
+	// "expand 32-byte k"
+	.word	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+
+#ifdef __thumb2__
+#  define adrl adr
+#endif
+
+.macro __rev		out, in,  t0, t1, t2
+.if __LINUX_ARM_ARCH__ >= 6
+	rev		\out, \in
+.else
+	lsl		\t0, \in, #24
+	and		\t1, \in, #0xff00
+	and		\t2, \in, #0xff0000
+	orr		\out, \t0, \in, lsr #24
+	orr		\out, \out, \t1, lsl #8
+	orr		\out, \out, \t2, lsr #8
+.endif
+.endm
+
+.macro _le32_bswap	x,  t0, t1, t2
+#ifdef __ARMEB__
+	__rev		\x, \x,  \t0, \t1, \t2
+#endif
+.endm
+
+.macro _le32_bswap_4x	a, b, c, d,  t0, t1, t2
+	_le32_bswap	\a,  \t0, \t1, \t2
+	_le32_bswap	\b,  \t0, \t1, \t2
+	_le32_bswap	\c,  \t0, \t1, \t2
+	_le32_bswap	\d,  \t0, \t1, \t2
+.endm
+
+.macro __ldrd		a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	ldrd		\a, \b, [\src, #\offset]
+#else
+	ldr		\a, [\src, #\offset]
+	ldr		\b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd		a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	strd		\a, \b, [\dst, #\offset]
+#else
+	str		\a, [\dst, #\offset]
+	str		\b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
+
+	// a += b; d ^= a; d = rol(d, 16);
+	add		\a1, \a1, \b1, ror #brot
+	add		\a2, \a2, \b2, ror #brot
+	eor		\d1, \a1, \d1, ror #drot
+	eor		\d2, \a2, \d2, ror #drot
+	// drot == 32 - 16 == 16
+
+	// c += d; b ^= c; b = rol(b, 12);
+	add		\c1, \c1, \d1, ror #16
+	add		\c2, \c2, \d2, ror #16
+	eor		\b1, \c1, \b1, ror #brot
+	eor		\b2, \c2, \b2, ror #brot
+	// brot == 32 - 12 == 20
+
+	// a += b; d ^= a; d = rol(d, 8);
+	add		\a1, \a1, \b1, ror #20
+	add		\a2, \a2, \b2, ror #20
+	eor		\d1, \a1, \d1, ror #16
+	eor		\d2, \a2, \d2, ror #16
+	// drot == 32 - 8 == 24
+
+	// c += d; b ^= c; b = rol(b, 7);
+	add		\c1, \c1, \d1, ror #24
+	add		\c2, \c2, \d2, ror #24
+	eor		\b1, \c1, \b1, ror #20
+	eor		\b2, \c2, \b2, ror #20
+	// brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+	// column round
+
+	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
+
+	// save (x8, x9); restore (x10, x11)
+	__strd		X8_X10, X9_X11, sp, 0
+	__ldrd		X8_X10, X9_X11, sp, 8
+
+	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
+
+	.set brot, 25
+	.set drot, 24
+
+	// diagonal round
+
+	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
+
+	// save (x10, x11); restore (x8, x9)
+	__strd		X8_X10, X9_X11, sp, 8
+	__ldrd		X8_X10, X9_X11, sp, 0
+
+	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute	nrounds
+	.set brot, 0
+	.set drot, 0
+	.rept \nrounds / 2
+	 _doubleround
+	.endr
+.endm
+
+.macro _chacha		nrounds
+
+.Lnext_block\@:
+	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+
+	// Do the core ChaCha permutation to update x0-x15.
+	_chacha_permute	\nrounds
+
+	add		sp, #8
+	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+	push		{X8_X10, X9_X11, X12, X13, X14, X15}
+
+	// Load (OUT, IN, LEN).
+	ldr		r14, [sp, #96]
+	ldr		r12, [sp, #100]
+	ldr		r11, [sp, #104]
+
+	orr		r10, r14, r12
+
+	// Use slow path if fewer than 64 bytes remain.
+	cmp		r11, #64
+	blt		.Lxor_slowpath\@
+
+	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
+	// ARMv6+, since ldmia and stmia (used below) still require alignment.
+	tst		r10, #3
+	bne		.Lxor_slowpath\@
+
+	// Fast path: XOR 64 bytes of aligned data.
+
+	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// x0-x3
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
+	ldmia		r12!, {r8-r11}
+	eor		X0, X0, r8
+	eor		X1, X1, r9
+	eor		X2, X2, r10
+	eor		X3, X3, r11
+	stmia		r14!, {X0-X3}
+
+	// x4-x7
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	ldmia		r12!, {X0-X3}
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
+	eor		X4, X4, X0
+	eor		X5, X5, X1
+	eor		X6, X6, X2
+	eor		X7, X7, X3
+	stmia		r14!, {X4-X7}
+
+	// x8-x15
+	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
+	ldmia		r12!, {r8-r11}
+	eor		r0, r0, r8		// x8
+	eor		r1, r1, r9		// x9
+	eor		r6, r6, r10		// x10
+	eor		r7, r7, r11		// x11
+	stmia		r14!, {r0,r1,r6,r7}
+	ldmia		r12!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
+	  ldr		r9, [sp, #72]		// load LEN
+	eor		r2, r2, r0		// x12
+	eor		r3, r3, r1		// x13
+	eor		r4, r4, r6		// x14
+	eor		r5, r5, r7		// x15
+	  subs		r9, #64			// decrement and check LEN
+	stmia		r14!, {r2-r5}
+
+	beq		.Ldone\@
+
+.Lprepare_for_next_block\@:
+
+	// Stack: x0-x15 OUT IN LEN
+
+	// Increment block counter (x12)
+	add		r8, #1
+
+	// Store updated (OUT, IN, LEN)
+	str		r14, [sp, #64]
+	str		r12, [sp, #68]
+	str		r9, [sp, #72]
+
+	mov		r14, sp
+
+	// Store updated block counter (x12)
+	str		r8, [sp, #48]
+
+	sub		sp, #16
+
+	// Reload state and do next block
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	__strd		r10, r11, sp, 8		// store x10-x11 before state
+	ldmia		r14, {r10-r12,r14}	// load x12-x15
+	b		.Lnext_block\@
+
+.Lxor_slowpath\@:
+	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+	// We handle it by storing the 64 bytes of keystream to the stack, then
+	// XOR-ing the needed portion with the data.
+
+	// Allocate keystream buffer
+	sub		sp, #64
+	mov		r14, sp
+
+	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Save keystream for x0-x3
+	__ldrd		r8, r9, sp, 96
+	__ldrd		r10, r11, sp, 104
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
+	stmia		r14!, {X0-X3}
+
+	// Save keystream for x4-x7
+	__ldrd		r8, r9, sp, 112
+	__ldrd		r10, r11, sp, 120
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
+	  add		r8, sp, #64
+	stmia		r14!, {X4-X7}
+
+	// Save keystream for x8-x15
+	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 128
+	__ldrd		r10, r11, sp, 136
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
+	stmia		r14!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 144
+	__ldrd		r10, r11, sp, 152
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
+	stmia		r14, {r2-r5}
+
+	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+	// Registers: r8 is block counter, r12 is IN.
+
+	ldr		r9, [sp, #168]		// LEN
+	ldr		r14, [sp, #160]		// OUT
+	cmp		r9, #64
+	  mov		r0, sp
+	movle		r1, r9
+	movgt		r1, #64
+	// r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+	orr		r2, r12, r14
+	tst		r2, #3			// IN or OUT misaligned?
+	bne		.Lxor_next_byte\@
+.endif
+
+	// XOR a word at a time
+.rept 16
+	subs		r1, #4
+	blt		.Lxor_words_done\@
+	ldr		r2, [r12], #4
+	ldr		r3, [r0], #4
+	eor		r2, r2, r3
+	str		r2, [r14], #4
+.endr
+	b		.Lxor_slowpath_done\@
+.Lxor_words_done\@:
+	ands		r1, r1, #3
+	beq		.Lxor_slowpath_done\@
+
+	// XOR a byte at a time
+.Lxor_next_byte\@:
+	ldrb		r2, [r12], #1
+	ldrb		r3, [r0], #1
+	eor		r2, r2, r3
+	strb		r2, [r14], #1
+	subs		r1, #1
+	bne		.Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+	subs		r9, #64
+	add		sp, #96
+	bgt		.Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm	// _chacha
+
+/*
+ * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+ *		     const u32 iv[4]);
+ */
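+/*
+ * A rough caller-side sketch (illustrative only, not taken from this patch):
+ * 'iv' supplies the initial values of state words x12-x15, so iv[0] is the
+ * starting block counter (incremented here after every 64-byte block) and
+ * iv[1..3] the remaining IV/nonce words:
+ *
+ *	u32 key[8] = { ... };			// 256-bit key as 32-bit words
+ *	u32 iv[4]  = { counter, n0, n1, n2 };	// placeholder values
+ *	chacha20_arm(dst, src, len, key, iv);
+ */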
+ENTRY(chacha20_arm)
+	cmp		r2, #0			// len == 0?
+	reteq		lr
+
+	push		{r0-r2,r4-r11,lr}
+
+	// Push state x0-x15 onto stack.
+	// Also store an extra copy of x10-x11 just before the state.
+
+	ldr		r4, [sp, #48]		// iv
+	mov		r0, sp
+	sub		sp, #80
+
+	// iv: x12-x15
+	ldm		r4, {X12,X13,X14,X15}
+	stmdb		r0!, {X12,X13,X14,X15}
+
+	// key: x4-x11
+	__ldrd		X8_X10, X9_X11, r3, 24
+	__strd		X8_X10, X9_X11, sp, 8
+	stmdb		r0!, {X8_X10, X9_X11}
+	ldm		r3, {X4-X9_X11}
+	stmdb		r0!, {X4-X9_X11}
+
+	// constants: x0-x3
+	adrl		X3, .Lexpand_32byte_k
+	ldm		X3, {X0-X3}
+	__strd		X0, X1, sp, 16
+	__strd		X2, X3, sp, 24
+
+	_chacha		20
+
+	add		sp, #76
+	pop		{r4-r11, pc}
+ENDPROC(chacha20_arm)
+
+/*
+ * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ */
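+/*
+ * Descriptive note on the routine below: 'state' is a full 16-word ChaCha
+ * state. It is run through the 20-round permutation without the usual
+ * feed-forward addition of the original state, and words 0-3 and 12-15 of
+ * the result are written to 'out' (the HChaCha20 construction).
+ */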
+ENTRY(hchacha20_arm)
+	push		{r1,r4-r11,lr}
+
+	mov		r14, r0
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	push		{r10-r11}		// store x10-x11 to stack
+	ldm		r14, {r10-r12,r14}	// load x12-x15
+	sub		sp, #8
+
+	_chacha_permute	20
+
+	// Skip over (unused0-unused1, x10-x11)
+	add		sp, #16
+
+	// Fix up rotations of x12-x15
+	ror		X12, X12, #drot
+	ror		X13, X13, #drot
+	  pop		{r4}			// load 'out'
+	ror		X14, X14, #drot
+	ror		X15, X15, #drot
+
+	// Store (x0-x3,x12-x15) to 'out'
+	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+	pop		{r4-r11,pc}
+ENDPROC(hchacha20_arm)