blob: 04405581d2b996c02751dbe4c2121d2ef09c2b1f [file] [log] [blame]
b.liue9582032025-04-17 19:18:16 +08001From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3Date: Fri, 8 Nov 2019 13:22:31 +0100
4Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation
5
6commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
7
8These implementations from Samuel Neves support AVX and AVX-512VL.
9Originally this used AVX-512F, but Skylake thermal throttling made
10AVX-512VL more attractive and possible to do with negligible difference.
11
12Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
13Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
14Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
15[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
16Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
17Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
18Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
19---
20 arch/x86/crypto/Makefile | 2 +
21 arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
22 arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
23 crypto/Kconfig | 6 +
24 4 files changed, 499 insertions(+)
25 create mode 100644 arch/x86/crypto/blake2s-core.S
26 create mode 100644 arch/x86/crypto/blake2s-glue.c
27
28--- a/arch/x86/crypto/Makefile
29+++ b/arch/x86/crypto/Makefile
30@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
31 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
32 obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
33 obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
34+ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
35 endif
36
37 # These modules require assembler to support AVX2.
38@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
39 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
40
41 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
42+blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
43
44 ifeq ($(avx_supported),yes)
45 camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
46--- /dev/null
47+++ b/arch/x86/crypto/blake2s-core.S
48@@ -0,0 +1,258 @@
49+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
50+/*
51+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
52+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
53+ */
54+
55+#include <linux/linkage.h>
56+
57+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
58+.align 32
59+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 /* low 16 bytes: both compress routines start every block with this value in %xmm2 */
60+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F /* high 16 bytes (IV+0x10): XORed with the 16 bytes kept at state+0x20 to form %xmm3 */
61+.section .rodata.cst16.ROT16, "aM", @progbits, 16
62+.align 16
63+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 /* pshufb mask: rotates each 32-bit lane by 16 bits */
64+.section .rodata.cst16.ROR328, "aM", @progbits, 16
65+.align 16
66+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 /* pshufb mask: rotates each 32-bit lane right by 8 bits */
67+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
68+.align 64
69+SIGMA: /* 10 rows of 16 one-byte message-word indices; the SSSE3 round loop consumes one row per round and stops at SIGMA+0xa0 */
70+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
71+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
72+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
73+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
74+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
75+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
76+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
77+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
78+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
79+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
80+#ifdef CONFIG_AS_AVX512
81+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
82+.align 64
83+SIGMA2: /* 10 rows of 16 32-bit indices for vpermi2d; each row is applied to the previous round's already-permuted message, so rows after the first differ from SIGMA */
84+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
85+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
86+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
87+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
88+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
89+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
90+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
91+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
92+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
93+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
94+#endif /* CONFIG_AS_AVX512 */
95+
96+.text
97+#ifdef CONFIG_AS_SSSE3
98+ENTRY(blake2s_compress_ssse3) /* args per the C prototype, assuming SysV x86-64 order: %rdi=state, %rsi=block, %rdx=nblocks, %rcx=inc */
99+ testq %rdx,%rdx
100+ je .Lendofloop /* nblocks == 0: return without touching state */
101+ movdqu (%rdi),%xmm0 /* xmm0 = state bytes 0..15 */
102+ movdqu 0x10(%rdi),%xmm1 /* xmm1 = state bytes 16..31 */
103+ movdqa ROT16(%rip),%xmm12 /* shuffle mask for the rotate-by-16 step */
104+ movdqa ROR328(%rip),%xmm13 /* shuffle mask for the rotate-by-8 step */
105+ movdqu 0x20(%rdi),%xmm14 /* 16 bytes at state+0x20 (presumably the t[]/f[] words; confirm against struct blake2s_state) */
106+ movq %rcx,%xmm15 /* inc in the low quadword, upper quadword zero */
107+ leaq SIGMA+0xa0(%rip),%r8 /* r8 = one past the last SIGMA row (10 rows * 16 bytes) */
108+ jmp .Lbeginofloop
109+ .align 32
110+.Lbeginofloop: /* one iteration per 64-byte block */
111+ movdqa %xmm0,%xmm10 /* keep the incoming xmm0/xmm1 for the final XOR of this block */
112+ movdqa %xmm1,%xmm11
113+ paddq %xmm15,%xmm14 /* 64-bit add of inc to the low quadword of xmm14 */
114+ movdqa IV(%rip),%xmm2
115+ movdqa %xmm14,%xmm3
116+ pxor IV+0x10(%rip),%xmm3
117+ leaq SIGMA(%rip),%rcx /* %rcx is reused as the SIGMA row pointer; inc survives in xmm15 */
118+.Lroundloop: /* one iteration per SIGMA row */
119+ movzbl (%rcx),%eax /* eax = next message-word index from the current SIGMA row */
120+ movd (%rsi,%rax,4),%xmm4 /* load that 32-bit message word */
121+ movzbl 0x1(%rcx),%eax
122+ movd (%rsi,%rax,4),%xmm5
123+ movzbl 0x2(%rcx),%eax
124+ movd (%rsi,%rax,4),%xmm6
125+ movzbl 0x3(%rcx),%eax
126+ movd (%rsi,%rax,4),%xmm7
127+ punpckldq %xmm5,%xmm4 /* interleave the four loaded words into one 128-bit vector (xmm4) */
128+ punpckldq %xmm7,%xmm6
129+ punpcklqdq %xmm6,%xmm4
130+ paddd %xmm4,%xmm0
131+ paddd %xmm1,%xmm0
132+ pxor %xmm0,%xmm3
133+ pshufb %xmm12,%xmm3 /* rotate each 32-bit lane by 16 via byte shuffle */
134+ paddd %xmm3,%xmm2
135+ pxor %xmm2,%xmm1
136+ movdqa %xmm1,%xmm8
137+ psrld $0xc,%xmm1 /* rotate right by 12: (x >> 12) | (x << 20) */
138+ pslld $0x14,%xmm8
139+ por %xmm8,%xmm1
140+ movzbl 0x4(%rcx),%eax /* indices 4..7 of the row feed the second addition */
141+ movd (%rsi,%rax,4),%xmm5
142+ movzbl 0x5(%rcx),%eax
143+ movd (%rsi,%rax,4),%xmm6
144+ movzbl 0x6(%rcx),%eax
145+ movd (%rsi,%rax,4),%xmm7
146+ movzbl 0x7(%rcx),%eax
147+ movd (%rsi,%rax,4),%xmm4
148+ punpckldq %xmm6,%xmm5
149+ punpckldq %xmm4,%xmm7
150+ punpcklqdq %xmm7,%xmm5
151+ paddd %xmm5,%xmm0
152+ paddd %xmm1,%xmm0
153+ pxor %xmm0,%xmm3
154+ pshufb %xmm13,%xmm3 /* rotate each 32-bit lane right by 8 via byte shuffle */
155+ paddd %xmm3,%xmm2
156+ pxor %xmm2,%xmm1
157+ movdqa %xmm1,%xmm8
158+ psrld $0x7,%xmm1 /* rotate right by 7: (x >> 7) | (x << 25) */
159+ pslld $0x19,%xmm8
160+ por %xmm8,%xmm1
161+ pshufd $0x93,%xmm0,%xmm0 /* rotate the lanes of xmm0/xmm3/xmm2 for the second half of the round; undone at the end of the round */
162+ pshufd $0x4e,%xmm3,%xmm3
163+ pshufd $0x39,%xmm2,%xmm2
164+ movzbl 0x8(%rcx),%eax /* second half of the round: row indices 8..15 */
165+ movd (%rsi,%rax,4),%xmm6
166+ movzbl 0x9(%rcx),%eax
167+ movd (%rsi,%rax,4),%xmm7
168+ movzbl 0xa(%rcx),%eax
169+ movd (%rsi,%rax,4),%xmm4
170+ movzbl 0xb(%rcx),%eax
171+ movd (%rsi,%rax,4),%xmm5
172+ punpckldq %xmm7,%xmm6
173+ punpckldq %xmm5,%xmm4
174+ punpcklqdq %xmm4,%xmm6
175+ paddd %xmm6,%xmm0
176+ paddd %xmm1,%xmm0
177+ pxor %xmm0,%xmm3
178+ pshufb %xmm12,%xmm3 /* rotate by 16 */
179+ paddd %xmm3,%xmm2
180+ pxor %xmm2,%xmm1
181+ movdqa %xmm1,%xmm8
182+ psrld $0xc,%xmm1 /* rotate right by 12 */
183+ pslld $0x14,%xmm8
184+ por %xmm8,%xmm1
185+ movzbl 0xc(%rcx),%eax
186+ movd (%rsi,%rax,4),%xmm7
187+ movzbl 0xd(%rcx),%eax
188+ movd (%rsi,%rax,4),%xmm4
189+ movzbl 0xe(%rcx),%eax
190+ movd (%rsi,%rax,4),%xmm5
191+ movzbl 0xf(%rcx),%eax
192+ movd (%rsi,%rax,4),%xmm6
193+ punpckldq %xmm4,%xmm7
194+ punpckldq %xmm6,%xmm5
195+ punpcklqdq %xmm5,%xmm7
196+ paddd %xmm7,%xmm0
197+ paddd %xmm1,%xmm0
198+ pxor %xmm0,%xmm3
199+ pshufb %xmm13,%xmm3 /* rotate right by 8 */
200+ paddd %xmm3,%xmm2
201+ pxor %xmm2,%xmm1
202+ movdqa %xmm1,%xmm8
203+ psrld $0x7,%xmm1 /* rotate right by 7 */
204+ pslld $0x19,%xmm8
205+ por %xmm8,%xmm1
206+ pshufd $0x39,%xmm0,%xmm0 /* inverse lane rotations: restore the layout used at the top of the round */
207+ pshufd $0x4e,%xmm3,%xmm3
208+ pshufd $0x93,%xmm2,%xmm2
209+ addq $0x10,%rcx /* advance to the next 16-byte SIGMA row */
210+ cmpq %r8,%rcx
211+ jnz .Lroundloop /* loop until all 10 rows have been used */
212+ pxor %xmm2,%xmm0 /* new xmm0 = xmm0 ^ xmm2 ^ saved xmm10; new xmm1 = xmm1 ^ xmm3 ^ saved xmm11 */
213+ pxor %xmm3,%xmm1
214+ pxor %xmm10,%xmm0
215+ pxor %xmm11,%xmm1
216+ addq $0x40,%rsi /* next 64-byte input block */
217+ decq %rdx
218+ jnz .Lbeginofloop
219+ movdqu %xmm0,(%rdi) /* write the first 32 state bytes back */
220+ movdqu %xmm1,0x10(%rdi)
221+ movdqu %xmm14,0x20(%rdi) /* and the 16 bytes at +0x20, now advanced by nblocks * inc */
222+.Lendofloop:
223+ ret
224+ENDPROC(blake2s_compress_ssse3)
225+#endif /* CONFIG_AS_SSSE3 */
226+
227+#ifdef CONFIG_AS_AVX512
228+ENTRY(blake2s_compress_avx512) /* same arguments as blake2s_compress_ssse3. NOTE(review): no nblocks == 0 check here; a zero count would wrap at the decq %rdx below */
229+ vmovdqu (%rdi),%xmm0 /* xmm0 = state bytes 0..15 */
230+ vmovdqu 0x10(%rdi),%xmm1 /* xmm1 = state bytes 16..31 */
231+ vmovdqu 0x20(%rdi),%xmm4 /* 16 bytes at state+0x20 (presumably the t[]/f[] words; confirm against struct blake2s_state) */
232+ vmovq %rcx,%xmm5 /* inc in the low quadword, upper quadword zero */
233+ vmovdqa IV(%rip),%xmm14 /* both IV halves stay in registers for the whole call */
234+ vmovdqa IV+16(%rip),%xmm15
235+ jmp .Lblake2s_compress_avx512_mainloop
236+.align 32
237+.Lblake2s_compress_avx512_mainloop: /* one iteration per 64-byte block */
238+ vmovdqa %xmm0,%xmm10 /* keep the incoming xmm0/xmm1 for the final XOR of this block */
239+ vmovdqa %xmm1,%xmm11
240+ vpaddq %xmm5,%xmm4,%xmm4 /* 64-bit add of inc to the low quadword of xmm4 */
241+ vmovdqa %xmm14,%xmm2
242+ vpxor %xmm15,%xmm4,%xmm3
243+ vmovdqu (%rsi),%ymm6 /* ymm6:ymm7 = the whole 64-byte message block */
244+ vmovdqu 0x20(%rsi),%ymm7
245+ addq $0x40,%rsi /* advance the input pointer now; the block is already in registers */
246+ leaq SIGMA2(%rip),%rax
247+ movb $0xa,%cl /* 10 rounds; %cl overwrites the low byte of inc, which survives in xmm5 */
248+.Lblake2s_compress_avx512_roundloop:
249+ addq $0x40,%rax /* step to the next 64-byte SIGMA2 row; the current row is read at negative offsets */
250+ vmovdqa -0x40(%rax),%ymm8 /* indices for the first 8 words of this round */
251+ vmovdqa -0x20(%rax),%ymm9 /* indices for the last 8 words of this round */
252+ vpermi2d %ymm7,%ymm6,%ymm8 /* select words from ymm6:ymm7 by index; the result overwrites the index register */
253+ vpermi2d %ymm7,%ymm6,%ymm9
254+ vmovdqa %ymm8,%ymm6 /* the permuted message becomes the source for the next round's permutation */
255+ vmovdqa %ymm9,%ymm7
256+ vpaddd %xmm8,%xmm0,%xmm0 /* low 128 bits of ymm8: first four message words */
257+ vpaddd %xmm1,%xmm0,%xmm0
258+ vpxor %xmm0,%xmm3,%xmm3
259+ vprord $0x10,%xmm3,%xmm3 /* rotate right by 16 */
260+ vpaddd %xmm3,%xmm2,%xmm2
261+ vpxor %xmm2,%xmm1,%xmm1
262+ vprord $0xc,%xmm1,%xmm1 /* rotate right by 12 */
263+ vextracti128 $0x1,%ymm8,%xmm8 /* high 128 bits of ymm8: next four message words */
264+ vpaddd %xmm8,%xmm0,%xmm0
265+ vpaddd %xmm1,%xmm0,%xmm0
266+ vpxor %xmm0,%xmm3,%xmm3
267+ vprord $0x8,%xmm3,%xmm3 /* rotate right by 8 */
268+ vpaddd %xmm3,%xmm2,%xmm2
269+ vpxor %xmm2,%xmm1,%xmm1
270+ vprord $0x7,%xmm1,%xmm1 /* rotate right by 7 */
271+ vpshufd $0x93,%xmm0,%xmm0 /* rotate the lanes of xmm0/xmm3/xmm2 for the second half of the round; undone at the end of the round */
272+ vpshufd $0x4e,%xmm3,%xmm3
273+ vpshufd $0x39,%xmm2,%xmm2
274+ vpaddd %xmm9,%xmm0,%xmm0 /* second half of the round uses the words selected into ymm9 */
275+ vpaddd %xmm1,%xmm0,%xmm0
276+ vpxor %xmm0,%xmm3,%xmm3
277+ vprord $0x10,%xmm3,%xmm3
278+ vpaddd %xmm3,%xmm2,%xmm2
279+ vpxor %xmm2,%xmm1,%xmm1
280+ vprord $0xc,%xmm1,%xmm1
281+ vextracti128 $0x1,%ymm9,%xmm9
282+ vpaddd %xmm9,%xmm0,%xmm0
283+ vpaddd %xmm1,%xmm0,%xmm0
284+ vpxor %xmm0,%xmm3,%xmm3
285+ vprord $0x8,%xmm3,%xmm3
286+ vpaddd %xmm3,%xmm2,%xmm2
287+ vpxor %xmm2,%xmm1,%xmm1
288+ vprord $0x7,%xmm1,%xmm1
289+ vpshufd $0x39,%xmm0,%xmm0 /* inverse lane rotations: restore the layout used at the top of the round */
290+ vpshufd $0x4e,%xmm3,%xmm3
291+ vpshufd $0x93,%xmm2,%xmm2
292+ decb %cl
293+ jne .Lblake2s_compress_avx512_roundloop
294+ vpxor %xmm10,%xmm0,%xmm0 /* new xmm0 = xmm0 ^ saved xmm10 ^ xmm2; new xmm1 = xmm1 ^ saved xmm11 ^ xmm3 */
295+ vpxor %xmm11,%xmm1,%xmm1
296+ vpxor %xmm2,%xmm0,%xmm0
297+ vpxor %xmm3,%xmm1,%xmm1
298+ decq %rdx
299+ jne .Lblake2s_compress_avx512_mainloop
300+ vmovdqu %xmm0,(%rdi) /* write the first 32 state bytes back */
301+ vmovdqu %xmm1,0x10(%rdi)
302+ vmovdqu %xmm4,0x20(%rdi) /* and the 16 bytes at +0x20, now advanced by nblocks * inc */
303+ vzeroupper /* ymm6..ymm9 were used above; clear the upper register halves before returning */
304+ retq
305+ENDPROC(blake2s_compress_avx512)
306+#endif /* CONFIG_AS_AVX512 */
307--- /dev/null
308+++ b/arch/x86/crypto/blake2s-glue.c
309@@ -0,0 +1,233 @@
310+// SPDX-License-Identifier: GPL-2.0 OR MIT
311+/*
312+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
313+ */
314+
315+#include <crypto/internal/blake2s.h>
316+#include <crypto/internal/simd.h>
317+#include <crypto/internal/hash.h>
318+
319+#include <linux/types.h>
320+#include <linux/jump_label.h>
321+#include <linux/kernel.h>
322+#include <linux/module.h>
323+
324+#include <asm/cpufeature.h>
325+#include <asm/fpu/api.h>
326+#include <asm/processor.h>
327+#include <asm/simd.h>
328+
329+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
330+ const u8 *block, const size_t nblocks,
331+ const u32 inc); /* implemented in blake2s-core.S; returns immediately when nblocks is 0 */
332+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
333+ const u8 *block, const size_t nblocks,
334+ const u32 inc); /* implemented in blake2s-core.S, only built under CONFIG_AS_AVX512; has no nblocks == 0 check */
335+
336+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); /* off by default; enabled in blake2s_mod_init() when the CPU has SSSE3 */
337+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); /* off by default; enabled in blake2s_mod_init() when the AVX-512VL checks pass */
338+
339+void blake2s_compress_arch(struct blake2s_state *state,
340+ const u8 *block, size_t nblocks,
341+ const u32 inc)
342+{ /* Compress nblocks blocks from block into state: SIMD code when enabled and usable, blake2s_compress_generic() otherwise. */
343+ /* SIMD disables preemption, so relax after processing each page. */
344+ BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); /* compile-time check: a page must hold at least 8 blocks */
345+
346+ if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { /* no SSSE3, or SIMD not usable in this context */
347+ blake2s_compress_generic(state, block, nblocks, inc);
348+ return;
349+ }
350+
351+ for (;;) {
352+ const size_t blocks = min_t(size_t, nblocks,
353+ PAGE_SIZE / BLAKE2S_BLOCK_SIZE); /* at most one page worth of input per kernel_fpu_begin()/end() section */
354+
355+ kernel_fpu_begin(); /* NOTE(review): blocks is 0 if nblocks was 0 on entry; the SSSE3 routine tolerates that, the AVX-512 routine does not check */
356+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
357+ static_branch_likely(&blake2s_use_avx512)) /* the AVX-512 routine is only assembled when CONFIG_AS_AVX512 is set */
358+ blake2s_compress_avx512(state, block, blocks, inc);
359+ else
360+ blake2s_compress_ssse3(state, block, blocks, inc);
361+ kernel_fpu_end();
362+
363+ nblocks -= blocks;
364+ if (!nblocks)
365+ break;
366+ block += blocks * BLAKE2S_BLOCK_SIZE; /* skip past the input just consumed */
367+ }
368+}
369+EXPORT_SYMBOL(blake2s_compress_arch);
370+
371+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
372+ unsigned int keylen)
373+{ /* shash .setkey hook: copy the key into the tfm context. Returns 0, or -EINVAL for an empty or over-long key. */
374+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
375+
376+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { /* valid lengths are 1..BLAKE2S_KEY_SIZE */
377+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
378+ return -EINVAL;
379+ }
380+
381+ memcpy(tctx->key, key, keylen);
382+ tctx->keylen = keylen; /* a non-zero keylen makes crypto_blake2s_init() take the keyed path */
383+
384+ return 0;
385+}
386+
387+static int crypto_blake2s_init(struct shash_desc *desc)
388+{ /* shash .init hook: set up the per-request state for this tfm's digest size. Always returns 0. */
389+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
390+ struct blake2s_state *state = shash_desc_ctx(desc);
391+ const int outlen = crypto_shash_digestsize(desc->tfm);
392+
393+ if (tctx->keylen) /* a key was stored by crypto_blake2s_setkey(): keyed hash */
394+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
395+ else
396+ blake2s_init(state, outlen);
397+
398+ return 0;
399+}
400+
401+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
402+ unsigned int inlen)
403+{ /* shash .update hook: absorb inlen bytes, compressing full blocks and buffering the tail in state->buf. Always returns 0. */
404+ struct blake2s_state *state = shash_desc_ctx(desc);
405+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* free space left in the block buffer */
406+
407+ if (unlikely(!inlen))
408+ return 0;
409+ if (inlen > fill) { /* strictly greater: input that only just fills the buffer stays buffered */
410+ memcpy(state->buf + state->buflen, in, fill);
411+ blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
412+ state->buflen = 0;
413+ in += fill;
414+ inlen -= fill;
415+ }
416+ if (inlen > BLAKE2S_BLOCK_SIZE) { /* only reachable after the branch above emptied the buffer, since otherwise inlen <= fill */
417+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
418+ /* Hash one less (full) block than strictly possible */
419+ blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
420+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
421+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); /* leaves between 1 and BLAKE2S_BLOCK_SIZE bytes */
422+ }
423+ memcpy(state->buf + state->buflen, in, inlen); /* buffer whatever was not compressed */
424+ state->buflen += inlen;
425+
426+ return 0;
427+}
428+
429+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
430+{ /* shash .final hook: compress the zero-padded last block, copy state->outlen digest bytes to out, wipe the state. Always returns 0. */
431+ struct blake2s_state *state = shash_desc_ctx(desc);
432+
433+ blake2s_set_lastblock(state);
434+ memset(state->buf + state->buflen, 0,
435+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
436+ blake2s_compress_arch(state, state->buf, 1, state->buflen); /* inc is the buffered byte count, not a full block */
437+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert the hash words to little-endian before copying them out as bytes */
438+ memcpy(out, state->h, state->outlen);
439+ memzero_explicit(state, sizeof(*state)); /* clear the whole per-request state after use */
440+
441+ return 0;
442+}
443+
444+static struct shash_alg blake2s_algs[] = {{ /* four shash algorithms sharing the same hooks; entries differ only in cra_name, cra_driver_name and digestsize */
445+ .base.cra_name = "blake2s-128",
446+ .base.cra_driver_name = "blake2s-128-x86",
447+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
448+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), /* per-tfm context: the key stored by crypto_blake2s_setkey() */
449+ .base.cra_priority = 200,
450+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
451+ .base.cra_module = THIS_MODULE,
452+
453+ .digestsize = BLAKE2S_128_HASH_SIZE,
454+ .setkey = crypto_blake2s_setkey,
455+ .init = crypto_blake2s_init,
456+ .update = crypto_blake2s_update,
457+ .final = crypto_blake2s_final,
458+ .descsize = sizeof(struct blake2s_state), /* per-request context: the hash state itself */
459+}, {
460+ .base.cra_name = "blake2s-160",
461+ .base.cra_driver_name = "blake2s-160-x86",
462+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
463+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
464+ .base.cra_priority = 200,
465+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
466+ .base.cra_module = THIS_MODULE,
467+
468+ .digestsize = BLAKE2S_160_HASH_SIZE,
469+ .setkey = crypto_blake2s_setkey,
470+ .init = crypto_blake2s_init,
471+ .update = crypto_blake2s_update,
472+ .final = crypto_blake2s_final,
473+ .descsize = sizeof(struct blake2s_state),
474+}, {
475+ .base.cra_name = "blake2s-224",
476+ .base.cra_driver_name = "blake2s-224-x86",
477+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
478+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
479+ .base.cra_priority = 200,
480+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
481+ .base.cra_module = THIS_MODULE,
482+
483+ .digestsize = BLAKE2S_224_HASH_SIZE,
484+ .setkey = crypto_blake2s_setkey,
485+ .init = crypto_blake2s_init,
486+ .update = crypto_blake2s_update,
487+ .final = crypto_blake2s_final,
488+ .descsize = sizeof(struct blake2s_state),
489+}, {
490+ .base.cra_name = "blake2s-256",
491+ .base.cra_driver_name = "blake2s-256-x86",
492+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
493+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
494+ .base.cra_priority = 200,
495+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
496+ .base.cra_module = THIS_MODULE,
497+
498+ .digestsize = BLAKE2S_256_HASH_SIZE,
499+ .setkey = crypto_blake2s_setkey,
500+ .init = crypto_blake2s_init,
501+ .update = crypto_blake2s_update,
502+ .final = crypto_blake2s_final,
503+ .descsize = sizeof(struct blake2s_state),
504+}};
505+
506+static int __init blake2s_mod_init(void)
507+{ /* Module init: enable the SIMD static keys the CPU supports and register the shash algorithms. Returns the registration result. */
508+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
509+ return 0; /* load successfully but register nothing; both keys stay off, so blake2s_compress_arch() uses the generic code */
510+
511+ static_branch_enable(&blake2s_use_ssse3);
512+
513+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
514+ boot_cpu_has(X86_FEATURE_AVX) &&
515+ boot_cpu_has(X86_FEATURE_AVX2) &&
516+ boot_cpu_has(X86_FEATURE_AVX512F) &&
517+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
518+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
519+ XFEATURE_MASK_AVX512, NULL)) /* every CPU feature bit plus the SSE/YMM/AVX-512 xfeatures must be present */
520+ static_branch_enable(&blake2s_use_avx512);
521+
522+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
523+}
524+
525+static void __exit blake2s_mod_exit(void)
526+{ /* Module exit: undo the registration done by blake2s_mod_init(). */
527+ if (boot_cpu_has(X86_FEATURE_SSSE3)) /* same condition under which init registered the algorithms */
528+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
529+}
530+
531+module_init(blake2s_mod_init);
532+module_exit(blake2s_mod_exit);
533+
534+MODULE_ALIAS_CRYPTO("blake2s-128");
535+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
536+MODULE_ALIAS_CRYPTO("blake2s-160");
537+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
538+MODULE_ALIAS_CRYPTO("blake2s-224");
539+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
540+MODULE_ALIAS_CRYPTO("blake2s-256");
541+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
542+MODULE_LICENSE("GPL v2");
543--- a/crypto/Kconfig
544+++ b/crypto/Kconfig
545@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
546
547 See https://blake2.net for further information.
548
549+config CRYPTO_BLAKE2S_X86
550+ tristate "BLAKE2s digest algorithm (x86 accelerated version)"
551+ depends on X86 && 64BIT
552+ select CRYPTO_LIB_BLAKE2S_GENERIC
553+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
554+
555 config CRYPTO_CRCT10DIF
556 tristate "CRCT10DIF algorithm"
557 select CRYPTO_HASH