ASR_BASE

Change-Id: Icf3719cc0afe3eeb3edc7fa80a2eb5199ca9dda1
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
new file mode 100644
index 0000000..0440558
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0024-crypto-blake2s-x86_64-SIMD-implementation.patch
@@ -0,0 +1,593 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:31 +0100
+Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation
+
+commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
+
+These implementations from Samuel Neves support AVX and AVX-512VL.
+Originally this used AVX-512F, but Skylake thermal throttling made
+AVX-512VL more attractive, with only a negligible performance difference.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
+Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
+[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/Makefile       |   2 +
+ arch/x86/crypto/blake2s-core.S | 284 +++++++++++++++++++++++++++++++++
+ arch/x86/crypto/blake2s-glue.c | 243 ++++++++++++++++++++++++++++
+ crypto/Kconfig                 |   6 +
+ 4 files changed, 535 insertions(+)
+ create mode 100644 arch/x86/crypto/blake2s-core.S
+ create mode 100644 arch/x86/crypto/blake2s-glue.c
+
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
+ 	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
++	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
+ endif
+ 
+ # These modules require assembler to support AVX2.
+@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
+ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+ 
+ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
++blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+ 
+ ifeq ($(avx_supported),yes)
+ 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-core.S
+@@ -0,0 +1,284 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
++ */
++
++#include <linux/linkage.h>
++
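++/*
++ * IV is the BLAKE2s initialization vector (the same eight words as
++ * SHA-256's initial state). ROT16 and ROR328 are pshufb masks that
++ * implement a 32-bit rotate right by 16 and by 8, respectively. SIGMA
++ * holds the ten message-schedule permutations, pre-permuted to match
++ * this implementation's load order rather than the textbook table.
++ */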
++.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
++.align 32
++IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
++	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
++.section .rodata.cst16.ROT16, "aM", @progbits, 16
++.align 16
++ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
++.section .rodata.cst16.ROR328, "aM", @progbits, 16
++.align 16
++ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
++.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
++.align 64
++SIGMA:
++.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
++.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
++.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
++.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
++.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
++.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
++.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
++.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
++.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
++.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
++#ifdef CONFIG_AS_AVX512
++.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
++.align 64
++SIGMA2:
++.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
++.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
++.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
++.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
++.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
++.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
++.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
++.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
++.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
++.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
++#endif /* CONFIG_AS_AVX512 */
++
++.text
++#ifdef CONFIG_AS_SSSE3
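++/*
++ * void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block,
++ *                             const size_t nblocks, const u32 inc);
++ * %rdi: state, %rsi: block, %rdx: nblocks, %rcx: inc
++ */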
++ENTRY(blake2s_compress_ssse3)
++	testq		%rdx,%rdx
++	je		.Lendofloop
++	movdqu		(%rdi),%xmm0
++	movdqu		0x10(%rdi),%xmm1
++	movdqa		ROT16(%rip),%xmm12
++	movdqa		ROR328(%rip),%xmm13
++	movdqu		0x20(%rdi),%xmm14
++	movq		%rcx,%xmm15
++	leaq		SIGMA+0xa0(%rip),%r8
++	jmp		.Lbeginofloop
++	.align		32
++.Lbeginofloop:
++	movdqa		%xmm0,%xmm10
++	movdqa		%xmm1,%xmm11
++	paddq		%xmm15,%xmm14
++	movdqa		IV(%rip),%xmm2
++	movdqa		%xmm14,%xmm3
++	pxor		IV+0x10(%rip),%xmm3
++	leaq		SIGMA(%rip),%rcx
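++	/*
++	 * One iteration per round: each 16-byte SIGMA row supplies the
++	 * indices of the message words fed into the G functions; %r8
++	 * marks the end of the ten-row table.
++	 */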
++.Lroundloop:
++	movzbl		(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm4
++	movzbl		0x1(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm5
++	movzbl		0x2(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm6
++	movzbl		0x3(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm7
++	punpckldq	%xmm5,%xmm4
++	punpckldq	%xmm7,%xmm6
++	punpcklqdq	%xmm6,%xmm4
++	paddd		%xmm4,%xmm0
++	paddd		%xmm1,%xmm0
++	pxor		%xmm0,%xmm3
++	pshufb		%xmm12,%xmm3
++	paddd		%xmm3,%xmm2
++	pxor		%xmm2,%xmm1
++	movdqa		%xmm1,%xmm8
++	psrld		$0xc,%xmm1
++	pslld		$0x14,%xmm8
++	por		%xmm8,%xmm1
++	movzbl		0x4(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm5
++	movzbl		0x5(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm6
++	movzbl		0x6(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm7
++	movzbl		0x7(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm4
++	punpckldq	%xmm6,%xmm5
++	punpckldq	%xmm4,%xmm7
++	punpcklqdq	%xmm7,%xmm5
++	paddd		%xmm5,%xmm0
++	paddd		%xmm1,%xmm0
++	pxor		%xmm0,%xmm3
++	pshufb		%xmm13,%xmm3
++	paddd		%xmm3,%xmm2
++	pxor		%xmm2,%xmm1
++	movdqa		%xmm1,%xmm8
++	psrld		$0x7,%xmm1
++	pslld		$0x19,%xmm8
++	por		%xmm8,%xmm1
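++	/* Rotate lanes to bring the diagonals into column position. */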
++	pshufd		$0x93,%xmm0,%xmm0
++	pshufd		$0x4e,%xmm3,%xmm3
++	pshufd		$0x39,%xmm2,%xmm2
++	movzbl		0x8(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm6
++	movzbl		0x9(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm7
++	movzbl		0xa(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm4
++	movzbl		0xb(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm5
++	punpckldq	%xmm7,%xmm6
++	punpckldq	%xmm5,%xmm4
++	punpcklqdq	%xmm4,%xmm6
++	paddd		%xmm6,%xmm0
++	paddd		%xmm1,%xmm0
++	pxor		%xmm0,%xmm3
++	pshufb		%xmm12,%xmm3
++	paddd		%xmm3,%xmm2
++	pxor		%xmm2,%xmm1
++	movdqa		%xmm1,%xmm8
++	psrld		$0xc,%xmm1
++	pslld		$0x14,%xmm8
++	por		%xmm8,%xmm1
++	movzbl		0xc(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm7
++	movzbl		0xd(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm4
++	movzbl		0xe(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm5
++	movzbl		0xf(%rcx),%eax
++	movd		(%rsi,%rax,4),%xmm6
++	punpckldq	%xmm4,%xmm7
++	punpckldq	%xmm6,%xmm5
++	punpcklqdq	%xmm5,%xmm7
++	paddd		%xmm7,%xmm0
++	paddd		%xmm1,%xmm0
++	pxor		%xmm0,%xmm3
++	pshufb		%xmm13,%xmm3
++	paddd		%xmm3,%xmm2
++	pxor		%xmm2,%xmm1
++	movdqa		%xmm1,%xmm8
++	psrld		$0x7,%xmm1
++	pslld		$0x19,%xmm8
++	por		%xmm8,%xmm1
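++	/* Undo the lane rotation before the next round. */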
++	pshufd		$0x39,%xmm0,%xmm0
++	pshufd		$0x4e,%xmm3,%xmm3
++	pshufd		$0x93,%xmm2,%xmm2
++	addq		$0x10,%rcx
++	cmpq		%r8,%rcx
++	jnz		.Lroundloop
++	pxor		%xmm2,%xmm0
++	pxor		%xmm3,%xmm1
++	pxor		%xmm10,%xmm0
++	pxor		%xmm11,%xmm1
++	addq		$0x40,%rsi
++	decq		%rdx
++	jnz		.Lbeginofloop
++	movdqu		%xmm0,(%rdi)
++	movdqu		%xmm1,0x10(%rdi)
++	movdqu		%xmm14,0x20(%rdi)
++.Lendofloop:
++	ret
++ENDPROC(blake2s_compress_ssse3)
++#endif /* CONFIG_AS_SSSE3 */
++
++#ifdef CONFIG_AS_AVX512
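++/*
++ * Same C prototype and register assignment as the SSSE3 version.
++ * AVX-512VL provides vprord for the 32-bit rotates, and vpermi2d
++ * permutes the message words entirely in registers, driven by the
++ * dword indices in SIGMA2.
++ */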
++ENTRY(blake2s_compress_avx512)
++	vmovdqu		(%rdi),%xmm0
++	vmovdqu		0x10(%rdi),%xmm1
++	vmovdqu		0x20(%rdi),%xmm4
++	vmovq		%rcx,%xmm5
++	vmovdqa		IV(%rip),%xmm14
++	vmovdqa		IV+16(%rip),%xmm15
++	jmp		.Lblake2s_compress_avx512_mainloop
++.align 32
++.Lblake2s_compress_avx512_mainloop:
++	vmovdqa		%xmm0,%xmm10
++	vmovdqa		%xmm1,%xmm11
++	vpaddq		%xmm5,%xmm4,%xmm4
++	vmovdqa		%xmm14,%xmm2
++	vpxor		%xmm15,%xmm4,%xmm3
++	vmovdqu		(%rsi),%ymm6
++	vmovdqu		0x20(%rsi),%ymm7
++	addq		$0x40,%rsi
++	leaq		SIGMA2(%rip),%rax
++	movb		$0xa,%cl
++.Lblake2s_compress_avx512_roundloop:
++	addq		$0x40,%rax
++	vmovdqa		-0x40(%rax),%ymm8
++	vmovdqa		-0x20(%rax),%ymm9
++	vpermi2d	%ymm7,%ymm6,%ymm8
++	vpermi2d	%ymm7,%ymm6,%ymm9
++	vmovdqa		%ymm8,%ymm6
++	vmovdqa		%ymm9,%ymm7
++	vpaddd		%xmm8,%xmm0,%xmm0
++	vpaddd		%xmm1,%xmm0,%xmm0
++	vpxor		%xmm0,%xmm3,%xmm3
++	vprord		$0x10,%xmm3,%xmm3
++	vpaddd		%xmm3,%xmm2,%xmm2
++	vpxor		%xmm2,%xmm1,%xmm1
++	vprord		$0xc,%xmm1,%xmm1
++	vextracti128	$0x1,%ymm8,%xmm8
++	vpaddd		%xmm8,%xmm0,%xmm0
++	vpaddd		%xmm1,%xmm0,%xmm0
++	vpxor		%xmm0,%xmm3,%xmm3
++	vprord		$0x8,%xmm3,%xmm3
++	vpaddd		%xmm3,%xmm2,%xmm2
++	vpxor		%xmm2,%xmm1,%xmm1
++	vprord		$0x7,%xmm1,%xmm1
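++	/* Rotate lanes to bring the diagonals into column position. */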
++	vpshufd		$0x93,%xmm0,%xmm0
++	vpshufd		$0x4e,%xmm3,%xmm3
++	vpshufd		$0x39,%xmm2,%xmm2
++	vpaddd		%xmm9,%xmm0,%xmm0
++	vpaddd		%xmm1,%xmm0,%xmm0
++	vpxor		%xmm0,%xmm3,%xmm3
++	vprord		$0x10,%xmm3,%xmm3
++	vpaddd		%xmm3,%xmm2,%xmm2
++	vpxor		%xmm2,%xmm1,%xmm1
++	vprord		$0xc,%xmm1,%xmm1
++	vextracti128	$0x1,%ymm9,%xmm9
++	vpaddd		%xmm9,%xmm0,%xmm0
++	vpaddd		%xmm1,%xmm0,%xmm0
++	vpxor		%xmm0,%xmm3,%xmm3
++	vprord		$0x8,%xmm3,%xmm3
++	vpaddd		%xmm3,%xmm2,%xmm2
++	vpxor		%xmm2,%xmm1,%xmm1
++	vprord		$0x7,%xmm1,%xmm1
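++	/* Undo the lane rotation before the next round. */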
++	vpshufd		$0x39,%xmm0,%xmm0
++	vpshufd		$0x4e,%xmm3,%xmm3
++	vpshufd		$0x93,%xmm2,%xmm2
++	decb		%cl
++	jne		.Lblake2s_compress_avx512_roundloop
++	vpxor		%xmm10,%xmm0,%xmm0
++	vpxor		%xmm11,%xmm1,%xmm1
++	vpxor		%xmm2,%xmm0,%xmm0
++	vpxor		%xmm3,%xmm1,%xmm1
++	decq		%rdx
++	jne		.Lblake2s_compress_avx512_mainloop
++	vmovdqu		%xmm0,(%rdi)
++	vmovdqu		%xmm1,0x10(%rdi)
++	vmovdqu		%xmm4,0x20(%rdi)
++	vzeroupper
++	retq
++ENDPROC(blake2s_compress_avx512)
++#endif /* CONFIG_AS_AVX512 */
+--- /dev/null
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -0,0 +1,243 @@
++// SPDX-License-Identifier: GPL-2.0 OR MIT
++/*
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#include <crypto/internal/blake2s.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/hash.h>
++
++#include <linux/types.h>
++#include <linux/jump_label.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/fpu/api.h>
++#include <asm/processor.h>
++#include <asm/simd.h>
++
++asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
++				       const u8 *block, const size_t nblocks,
++				       const u32 inc);
++asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
++					const u8 *block, const size_t nblocks,
++					const u32 inc);
++
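++/* Enabled once at module init, according to the CPU's feature flags. */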
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
++static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
++
++void blake2s_compress_arch(struct blake2s_state *state,
++			   const u8 *block, size_t nblocks,
++			   const u32 inc)
++{
++	/* SIMD disables preemption, so relax after processing each page. */
++	BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
++
++	if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
++		blake2s_compress_generic(state, block, nblocks, inc);
++		return;
++	}
++
++	for (;;) {
++		const size_t blocks = min_t(size_t, nblocks,
++					    PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
++
++		kernel_fpu_begin();
++		if (IS_ENABLED(CONFIG_AS_AVX512) &&
++		    static_branch_likely(&blake2s_use_avx512))
++			blake2s_compress_avx512(state, block, blocks, inc);
++		else
++			blake2s_compress_ssse3(state, block, blocks, inc);
++		kernel_fpu_end();
++
++		nblocks -= blocks;
++		if (!nblocks)
++			break;
++		block += blocks * BLAKE2S_BLOCK_SIZE;
++	}
++}
++EXPORT_SYMBOL(blake2s_compress_arch);
++
++static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
++				 unsigned int keylen)
++{
++	struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
++
++	if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
++		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
++		return -EINVAL;
++	}
++
++	memcpy(tctx->key, key, keylen);
++	tctx->keylen = keylen;
++
++	return 0;
++}
++
++static int crypto_blake2s_init(struct shash_desc *desc)
++{
++	struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
++	struct blake2s_state *state = shash_desc_ctx(desc);
++	const int outlen = crypto_shash_digestsize(desc->tfm);
++
++	if (tctx->keylen)
++		blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
++	else
++		blake2s_init(state, outlen);
++
++	return 0;
++}
++
++static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
++				 unsigned int inlen)
++{
++	struct blake2s_state *state = shash_desc_ctx(desc);
++	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
++
++	if (unlikely(!inlen))
++		return 0;
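++	/*
++	 * The strict '>' comparisons below keep at least one byte
++	 * buffered, so the final block is always compressed from
++	 * crypto_blake2s_final() with the last-block flag set.
++	 */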
++	if (inlen > fill) {
++		memcpy(state->buf + state->buflen, in, fill);
++		blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
++		state->buflen = 0;
++		in += fill;
++		inlen -= fill;
++	}
++	if (inlen > BLAKE2S_BLOCK_SIZE) {
++		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++		/* Hash one less (full) block than strictly possible */
++		blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
++		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++	}
++	memcpy(state->buf + state->buflen, in, inlen);
++	state->buflen += inlen;
++
++	return 0;
++}
++
++static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
++{
++	struct blake2s_state *state = shash_desc_ctx(desc);
++
++	blake2s_set_lastblock(state);
++	memset(state->buf + state->buflen, 0,
++	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
++	blake2s_compress_arch(state, state->buf, 1, state->buflen);
++	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
++	memcpy(out, state->h, state->outlen);
++	memzero_explicit(state, sizeof(*state));
++
++	return 0;
++}
++
++static struct shash_alg blake2s_algs[] = {{
++	.base.cra_name		= "blake2s-128",
++	.base.cra_driver_name	= "blake2s-128-x86",
++	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
++	.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx),
++	.base.cra_priority	= 200,
++	.base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++
++	.digestsize		= BLAKE2S_128_HASH_SIZE,
++	.setkey			= crypto_blake2s_setkey,
++	.init			= crypto_blake2s_init,
++	.update			= crypto_blake2s_update,
++	.final			= crypto_blake2s_final,
++	.descsize		= sizeof(struct blake2s_state),
++}, {
++	.base.cra_name		= "blake2s-160",
++	.base.cra_driver_name	= "blake2s-160-x86",
++	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
++	.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx),
++	.base.cra_priority	= 200,
++	.base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++
++	.digestsize		= BLAKE2S_160_HASH_SIZE,
++	.setkey			= crypto_blake2s_setkey,
++	.init			= crypto_blake2s_init,
++	.update			= crypto_blake2s_update,
++	.final			= crypto_blake2s_final,
++	.descsize		= sizeof(struct blake2s_state),
++}, {
++	.base.cra_name		= "blake2s-224",
++	.base.cra_driver_name	= "blake2s-224-x86",
++	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
++	.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx),
++	.base.cra_priority	= 200,
++	.base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++
++	.digestsize		= BLAKE2S_224_HASH_SIZE,
++	.setkey			= crypto_blake2s_setkey,
++	.init			= crypto_blake2s_init,
++	.update			= crypto_blake2s_update,
++	.final			= crypto_blake2s_final,
++	.descsize		= sizeof(struct blake2s_state),
++}, {
++	.base.cra_name		= "blake2s-256",
++	.base.cra_driver_name	= "blake2s-256-x86",
++	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
++	.base.cra_ctxsize	= sizeof(struct blake2s_tfm_ctx),
++	.base.cra_priority	= 200,
++	.base.cra_blocksize     = BLAKE2S_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++
++	.digestsize		= BLAKE2S_256_HASH_SIZE,
++	.setkey			= crypto_blake2s_setkey,
++	.init			= crypto_blake2s_init,
++	.update			= crypto_blake2s_update,
++	.final			= crypto_blake2s_final,
++	.descsize		= sizeof(struct blake2s_state),
++}};
++
++static int __init blake2s_mod_init(void)
++{
++	if (!boot_cpu_has(X86_FEATURE_SSSE3))
++		return 0;
++
++	static_branch_enable(&blake2s_use_ssse3);
++
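++	/*
++	 * Beyond the CPU feature bits, verify that the OS actually
++	 * saves and restores the SSE, YMM and AVX-512 register state.
++	 */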
++	if (IS_ENABLED(CONFIG_AS_AVX512) &&
++	    boot_cpu_has(X86_FEATURE_AVX) &&
++	    boot_cpu_has(X86_FEATURE_AVX2) &&
++	    boot_cpu_has(X86_FEATURE_AVX512F) &&
++	    boot_cpu_has(X86_FEATURE_AVX512VL) &&
++	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
++			      XFEATURE_MASK_AVX512, NULL))
++		static_branch_enable(&blake2s_use_avx512);
++
++	return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++static void __exit blake2s_mod_exit(void)
++{
++	if (boot_cpu_has(X86_FEATURE_SSSE3))
++		crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
++}
++
++module_init(blake2s_mod_init);
++module_exit(blake2s_mod_exit);
++
++MODULE_ALIAS_CRYPTO("blake2s-128");
++MODULE_ALIAS_CRYPTO("blake2s-128-x86");
++MODULE_ALIAS_CRYPTO("blake2s-160");
++MODULE_ALIAS_CRYPTO("blake2s-160-x86");
++MODULE_ALIAS_CRYPTO("blake2s-224");
++MODULE_ALIAS_CRYPTO("blake2s-224-x86");
++MODULE_ALIAS_CRYPTO("blake2s-256");
++MODULE_ALIAS_CRYPTO("blake2s-256-x86");
++MODULE_LICENSE("GPL v2");
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
+ 
+ 	  See https://blake2.net for further information.
+ 
++config CRYPTO_BLAKE2S_X86
++	tristate "BLAKE2s digest algorithm (x86 accelerated version)"
++	depends on X86 && 64BIT
++	select CRYPTO_LIB_BLAKE2S_GENERIC
++	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
++
+ config CRYPTO_CRCT10DIF
+ 	tristate "CRCT10DIF algorithm"
+ 	select CRYPTO_HASH