b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame] | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Fri, 8 Nov 2019 13:22:31 +0100 |
| 4 | Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation |
| 5 | |
| 6 | commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream. |
| 7 | |
| 8 | These implementations from Samuel Neves support AVX and AVX-512VL. |
| 9 | Originally this used AVX-512F, but Skylake thermal throttling made |
| 10 | AVX-512VL more attractive and possible to do with negligible difference. |
| 11 | |
| 12 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 13 | Signed-off-by: Samuel Neves <sneves@dei.uc.pt> |
| 14 | Co-developed-by: Samuel Neves <sneves@dei.uc.pt> |
| 15 | [ardb: move to arch/x86/crypto, wire into lib/crypto framework] |
| 16 | Signed-off-by: Ard Biesheuvel <ardb@kernel.org> |
| 17 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 18 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 19 | --- |
| 20 | arch/x86/crypto/Makefile | 2 + |
| 21 | arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++ |
| 22 | arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++ |
| 23 | crypto/Kconfig | 6 + |
| 24 | 4 files changed, 499 insertions(+) |
| 25 | create mode 100644 arch/x86/crypto/blake2s-core.S |
| 26 | create mode 100644 arch/x86/crypto/blake2s-glue.c |
| 27 | |
| 28 | --- a/arch/x86/crypto/Makefile |
| 29 | +++ b/arch/x86/crypto/Makefile |
| 30 | @@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes) |
| 31 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o |
| 32 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o |
| 33 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o |
| 34 | + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o |
| 35 | endif |
| 36 | |
| 37 | # These modules require assembler to support AVX2. |
| 38 | @@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8 |
| 39 | aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o |
| 40 | |
| 41 | nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o |
| 42 | +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o |
| 43 | |
| 44 | ifeq ($(avx_supported),yes) |
| 45 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ |
| 46 | --- /dev/null |
| 47 | +++ b/arch/x86/crypto/blake2s-core.S |
| 48 | @@ -0,0 +1,258 @@ |
| 49 | +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ |
| 50 | +/* |
| 51 | + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 52 | + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. |
| 53 | + */ |
| 54 | + |
| 55 | +#include <linux/linkage.h> |
| 56 | + |
| 57 | +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 |
| 58 | +.align 32 |
| 59 | +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 |
| 60 | + .octa 0x5BE0CD191F83D9AB9B05688C510E527F |
| 61 | +.section .rodata.cst16.ROT16, "aM", @progbits, 16 |
| 62 | +.align 16 |
| 63 | +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 |
| 64 | +.section .rodata.cst16.ROR328, "aM", @progbits, 16 |
| 65 | +.align 16 |
| 66 | +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 |
| 67 | +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 |
| 68 | +.align 64 |
| 69 | +SIGMA: |
| 70 | +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 |
| 71 | +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 |
| 72 | +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 |
| 73 | +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 |
| 74 | +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 |
| 75 | +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 |
| 76 | +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 |
| 77 | +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 |
| 78 | +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 |
| 79 | +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 |
| 80 | +#ifdef CONFIG_AS_AVX512 |
| 81 | +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 |
| 82 | +.align 64 |
| 83 | +SIGMA2: |
| 84 | +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 |
| 85 | +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 |
| 86 | +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 |
| 87 | +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 |
| 88 | +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 |
| 89 | +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 |
| 90 | +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 |
| 91 | +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 |
| 92 | +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 |
| 93 | +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 |
| 94 | +#endif /* CONFIG_AS_AVX512 */ |
| 95 | + |
| 96 | +.text |
| 97 | +#ifdef CONFIG_AS_SSSE3 |
| 98 | +ENTRY(blake2s_compress_ssse3) |
| 99 | + testq %rdx,%rdx |
| 100 | + je .Lendofloop |
| 101 | + movdqu (%rdi),%xmm0 |
| 102 | + movdqu 0x10(%rdi),%xmm1 |
| 103 | + movdqa ROT16(%rip),%xmm12 |
| 104 | + movdqa ROR328(%rip),%xmm13 |
| 105 | + movdqu 0x20(%rdi),%xmm14 |
| 106 | + movq %rcx,%xmm15 |
| 107 | + leaq SIGMA+0xa0(%rip),%r8 |
| 108 | + jmp .Lbeginofloop |
| 109 | + .align 32 |
| 110 | +.Lbeginofloop: |
| 111 | + movdqa %xmm0,%xmm10 |
| 112 | + movdqa %xmm1,%xmm11 |
| 113 | + paddq %xmm15,%xmm14 |
| 114 | + movdqa IV(%rip),%xmm2 |
| 115 | + movdqa %xmm14,%xmm3 |
| 116 | + pxor IV+0x10(%rip),%xmm3 |
| 117 | + leaq SIGMA(%rip),%rcx |
| 118 | +.Lroundloop: |
| 119 | + movzbl (%rcx),%eax |
| 120 | + movd (%rsi,%rax,4),%xmm4 |
| 121 | + movzbl 0x1(%rcx),%eax |
| 122 | + movd (%rsi,%rax,4),%xmm5 |
| 123 | + movzbl 0x2(%rcx),%eax |
| 124 | + movd (%rsi,%rax,4),%xmm6 |
| 125 | + movzbl 0x3(%rcx),%eax |
| 126 | + movd (%rsi,%rax,4),%xmm7 |
| 127 | + punpckldq %xmm5,%xmm4 |
| 128 | + punpckldq %xmm7,%xmm6 |
| 129 | + punpcklqdq %xmm6,%xmm4 |
| 130 | + paddd %xmm4,%xmm0 |
| 131 | + paddd %xmm1,%xmm0 |
| 132 | + pxor %xmm0,%xmm3 |
| 133 | + pshufb %xmm12,%xmm3 |
| 134 | + paddd %xmm3,%xmm2 |
| 135 | + pxor %xmm2,%xmm1 |
| 136 | + movdqa %xmm1,%xmm8 |
| 137 | + psrld $0xc,%xmm1 |
| 138 | + pslld $0x14,%xmm8 |
| 139 | + por %xmm8,%xmm1 |
| 140 | + movzbl 0x4(%rcx),%eax |
| 141 | + movd (%rsi,%rax,4),%xmm5 |
| 142 | + movzbl 0x5(%rcx),%eax |
| 143 | + movd (%rsi,%rax,4),%xmm6 |
| 144 | + movzbl 0x6(%rcx),%eax |
| 145 | + movd (%rsi,%rax,4),%xmm7 |
| 146 | + movzbl 0x7(%rcx),%eax |
| 147 | + movd (%rsi,%rax,4),%xmm4 |
| 148 | + punpckldq %xmm6,%xmm5 |
| 149 | + punpckldq %xmm4,%xmm7 |
| 150 | + punpcklqdq %xmm7,%xmm5 |
| 151 | + paddd %xmm5,%xmm0 |
| 152 | + paddd %xmm1,%xmm0 |
| 153 | + pxor %xmm0,%xmm3 |
| 154 | + pshufb %xmm13,%xmm3 |
| 155 | + paddd %xmm3,%xmm2 |
| 156 | + pxor %xmm2,%xmm1 |
| 157 | + movdqa %xmm1,%xmm8 |
| 158 | + psrld $0x7,%xmm1 |
| 159 | + pslld $0x19,%xmm8 |
| 160 | + por %xmm8,%xmm1 |
| 161 | + pshufd $0x93,%xmm0,%xmm0 |
| 162 | + pshufd $0x4e,%xmm3,%xmm3 |
| 163 | + pshufd $0x39,%xmm2,%xmm2 |
| 164 | + movzbl 0x8(%rcx),%eax |
| 165 | + movd (%rsi,%rax,4),%xmm6 |
| 166 | + movzbl 0x9(%rcx),%eax |
| 167 | + movd (%rsi,%rax,4),%xmm7 |
| 168 | + movzbl 0xa(%rcx),%eax |
| 169 | + movd (%rsi,%rax,4),%xmm4 |
| 170 | + movzbl 0xb(%rcx),%eax |
| 171 | + movd (%rsi,%rax,4),%xmm5 |
| 172 | + punpckldq %xmm7,%xmm6 |
| 173 | + punpckldq %xmm5,%xmm4 |
| 174 | + punpcklqdq %xmm4,%xmm6 |
| 175 | + paddd %xmm6,%xmm0 |
| 176 | + paddd %xmm1,%xmm0 |
| 177 | + pxor %xmm0,%xmm3 |
| 178 | + pshufb %xmm12,%xmm3 |
| 179 | + paddd %xmm3,%xmm2 |
| 180 | + pxor %xmm2,%xmm1 |
| 181 | + movdqa %xmm1,%xmm8 |
| 182 | + psrld $0xc,%xmm1 |
| 183 | + pslld $0x14,%xmm8 |
| 184 | + por %xmm8,%xmm1 |
| 185 | + movzbl 0xc(%rcx),%eax |
| 186 | + movd (%rsi,%rax,4),%xmm7 |
| 187 | + movzbl 0xd(%rcx),%eax |
| 188 | + movd (%rsi,%rax,4),%xmm4 |
| 189 | + movzbl 0xe(%rcx),%eax |
| 190 | + movd (%rsi,%rax,4),%xmm5 |
| 191 | + movzbl 0xf(%rcx),%eax |
| 192 | + movd (%rsi,%rax,4),%xmm6 |
| 193 | + punpckldq %xmm4,%xmm7 |
| 194 | + punpckldq %xmm6,%xmm5 |
| 195 | + punpcklqdq %xmm5,%xmm7 |
| 196 | + paddd %xmm7,%xmm0 |
| 197 | + paddd %xmm1,%xmm0 |
| 198 | + pxor %xmm0,%xmm3 |
| 199 | + pshufb %xmm13,%xmm3 |
| 200 | + paddd %xmm3,%xmm2 |
| 201 | + pxor %xmm2,%xmm1 |
| 202 | + movdqa %xmm1,%xmm8 |
| 203 | + psrld $0x7,%xmm1 |
| 204 | + pslld $0x19,%xmm8 |
| 205 | + por %xmm8,%xmm1 |
| 206 | + pshufd $0x39,%xmm0,%xmm0 |
| 207 | + pshufd $0x4e,%xmm3,%xmm3 |
| 208 | + pshufd $0x93,%xmm2,%xmm2 |
| 209 | + addq $0x10,%rcx |
| 210 | + cmpq %r8,%rcx |
| 211 | + jnz .Lroundloop |
| 212 | + pxor %xmm2,%xmm0 |
| 213 | + pxor %xmm3,%xmm1 |
| 214 | + pxor %xmm10,%xmm0 |
| 215 | + pxor %xmm11,%xmm1 |
| 216 | + addq $0x40,%rsi |
| 217 | + decq %rdx |
| 218 | + jnz .Lbeginofloop |
| 219 | + movdqu %xmm0,(%rdi) |
| 220 | + movdqu %xmm1,0x10(%rdi) |
| 221 | + movdqu %xmm14,0x20(%rdi) |
| 222 | +.Lendofloop: |
| 223 | + ret |
| 224 | +ENDPROC(blake2s_compress_ssse3) |
| 225 | +#endif /* CONFIG_AS_SSSE3 */ |
| 226 | + |
| 227 | +#ifdef CONFIG_AS_AVX512 |
| 228 | +ENTRY(blake2s_compress_avx512) |
| 229 | + vmovdqu (%rdi),%xmm0 |
| 230 | + vmovdqu 0x10(%rdi),%xmm1 |
| 231 | + vmovdqu 0x20(%rdi),%xmm4 |
| 232 | + vmovq %rcx,%xmm5 |
| 233 | + vmovdqa IV(%rip),%xmm14 |
| 234 | + vmovdqa IV+16(%rip),%xmm15 |
| 235 | + jmp .Lblake2s_compress_avx512_mainloop |
| 236 | +.align 32 |
| 237 | +.Lblake2s_compress_avx512_mainloop: |
| 238 | + vmovdqa %xmm0,%xmm10 |
| 239 | + vmovdqa %xmm1,%xmm11 |
| 240 | + vpaddq %xmm5,%xmm4,%xmm4 |
| 241 | + vmovdqa %xmm14,%xmm2 |
| 242 | + vpxor %xmm15,%xmm4,%xmm3 |
| 243 | + vmovdqu (%rsi),%ymm6 |
| 244 | + vmovdqu 0x20(%rsi),%ymm7 |
| 245 | + addq $0x40,%rsi |
| 246 | + leaq SIGMA2(%rip),%rax |
| 247 | + movb $0xa,%cl |
| 248 | +.Lblake2s_compress_avx512_roundloop: |
| 249 | + addq $0x40,%rax |
| 250 | + vmovdqa -0x40(%rax),%ymm8 |
| 251 | + vmovdqa -0x20(%rax),%ymm9 |
| 252 | + vpermi2d %ymm7,%ymm6,%ymm8 |
| 253 | + vpermi2d %ymm7,%ymm6,%ymm9 |
| 254 | + vmovdqa %ymm8,%ymm6 |
| 255 | + vmovdqa %ymm9,%ymm7 |
| 256 | + vpaddd %xmm8,%xmm0,%xmm0 |
| 257 | + vpaddd %xmm1,%xmm0,%xmm0 |
| 258 | + vpxor %xmm0,%xmm3,%xmm3 |
| 259 | + vprord $0x10,%xmm3,%xmm3 |
| 260 | + vpaddd %xmm3,%xmm2,%xmm2 |
| 261 | + vpxor %xmm2,%xmm1,%xmm1 |
| 262 | + vprord $0xc,%xmm1,%xmm1 |
| 263 | + vextracti128 $0x1,%ymm8,%xmm8 |
| 264 | + vpaddd %xmm8,%xmm0,%xmm0 |
| 265 | + vpaddd %xmm1,%xmm0,%xmm0 |
| 266 | + vpxor %xmm0,%xmm3,%xmm3 |
| 267 | + vprord $0x8,%xmm3,%xmm3 |
| 268 | + vpaddd %xmm3,%xmm2,%xmm2 |
| 269 | + vpxor %xmm2,%xmm1,%xmm1 |
| 270 | + vprord $0x7,%xmm1,%xmm1 |
| 271 | + vpshufd $0x93,%xmm0,%xmm0 |
| 272 | + vpshufd $0x4e,%xmm3,%xmm3 |
| 273 | + vpshufd $0x39,%xmm2,%xmm2 |
| 274 | + vpaddd %xmm9,%xmm0,%xmm0 |
| 275 | + vpaddd %xmm1,%xmm0,%xmm0 |
| 276 | + vpxor %xmm0,%xmm3,%xmm3 |
| 277 | + vprord $0x10,%xmm3,%xmm3 |
| 278 | + vpaddd %xmm3,%xmm2,%xmm2 |
| 279 | + vpxor %xmm2,%xmm1,%xmm1 |
| 280 | + vprord $0xc,%xmm1,%xmm1 |
| 281 | + vextracti128 $0x1,%ymm9,%xmm9 |
| 282 | + vpaddd %xmm9,%xmm0,%xmm0 |
| 283 | + vpaddd %xmm1,%xmm0,%xmm0 |
| 284 | + vpxor %xmm0,%xmm3,%xmm3 |
| 285 | + vprord $0x8,%xmm3,%xmm3 |
| 286 | + vpaddd %xmm3,%xmm2,%xmm2 |
| 287 | + vpxor %xmm2,%xmm1,%xmm1 |
| 288 | + vprord $0x7,%xmm1,%xmm1 |
| 289 | + vpshufd $0x39,%xmm0,%xmm0 |
| 290 | + vpshufd $0x4e,%xmm3,%xmm3 |
| 291 | + vpshufd $0x93,%xmm2,%xmm2 |
| 292 | + decb %cl |
| 293 | + jne .Lblake2s_compress_avx512_roundloop |
| 294 | + vpxor %xmm10,%xmm0,%xmm0 |
| 295 | + vpxor %xmm11,%xmm1,%xmm1 |
| 296 | + vpxor %xmm2,%xmm0,%xmm0 |
| 297 | + vpxor %xmm3,%xmm1,%xmm1 |
| 298 | + decq %rdx |
| 299 | + jne .Lblake2s_compress_avx512_mainloop |
| 300 | + vmovdqu %xmm0,(%rdi) |
| 301 | + vmovdqu %xmm1,0x10(%rdi) |
| 302 | + vmovdqu %xmm4,0x20(%rdi) |
| 303 | + vzeroupper |
| 304 | + retq |
| 305 | +ENDPROC(blake2s_compress_avx512) |
| 306 | +#endif /* CONFIG_AS_AVX512 */ |
| 307 | --- /dev/null |
| 308 | +++ b/arch/x86/crypto/blake2s-glue.c |
| 309 | @@ -0,0 +1,233 @@ |
| 310 | +// SPDX-License-Identifier: GPL-2.0 OR MIT |
| 311 | +/* |
| 312 | + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 313 | + */ |
| 314 | + |
| 315 | +#include <crypto/internal/blake2s.h> |
| 316 | +#include <crypto/internal/simd.h> |
| 317 | +#include <crypto/internal/hash.h> |
| 318 | + |
| 319 | +#include <linux/types.h> |
| 320 | +#include <linux/jump_label.h> |
| 321 | +#include <linux/kernel.h> |
| 322 | +#include <linux/module.h> |
| 323 | + |
| 324 | +#include <asm/cpufeature.h> |
| 325 | +#include <asm/fpu/api.h> |
| 326 | +#include <asm/processor.h> |
| 327 | +#include <asm/simd.h> |
| 328 | + |
| 329 | +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, |
| 330 | + const u8 *block, const size_t nblocks, |
| 331 | + const u32 inc); |
| 332 | +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, |
| 333 | + const u8 *block, const size_t nblocks, |
| 334 | + const u32 inc); |
| 335 | + |
| 336 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); |
| 337 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); |
| 338 | + |
| 339 | +void blake2s_compress_arch(struct blake2s_state *state, |
| 340 | + const u8 *block, size_t nblocks, |
| 341 | + const u32 inc) |
| 342 | +{ |
| 343 | + /* SIMD disables preemption, so relax after processing each page. */ |
| 344 | + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); |
| 345 | + |
| 346 | + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { |
| 347 | + blake2s_compress_generic(state, block, nblocks, inc); |
| 348 | + return; |
| 349 | + } |
| 350 | + |
| 351 | + for (;;) { |
| 352 | + const size_t blocks = min_t(size_t, nblocks, |
| 353 | + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); |
| 354 | + |
| 355 | + kernel_fpu_begin(); |
| 356 | + if (IS_ENABLED(CONFIG_AS_AVX512) && |
| 357 | + static_branch_likely(&blake2s_use_avx512)) |
| 358 | + blake2s_compress_avx512(state, block, blocks, inc); |
| 359 | + else |
| 360 | + blake2s_compress_ssse3(state, block, blocks, inc); |
| 361 | + kernel_fpu_end(); |
| 362 | + |
| 363 | + nblocks -= blocks; |
| 364 | + if (!nblocks) |
| 365 | + break; |
| 366 | + block += blocks * BLAKE2S_BLOCK_SIZE; |
| 367 | + } |
| 368 | +} |
| 369 | +EXPORT_SYMBOL(blake2s_compress_arch); |
| 370 | + |
| 371 | +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, |
| 372 | + unsigned int keylen) |
| 373 | +{ |
| 374 | + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); |
| 375 | + |
| 376 | + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { |
| 377 | + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); |
| 378 | + return -EINVAL; |
| 379 | + } |
| 380 | + |
| 381 | + memcpy(tctx->key, key, keylen); |
| 382 | + tctx->keylen = keylen; |
| 383 | + |
| 384 | + return 0; |
| 385 | +} |
| 386 | + |
| 387 | +static int crypto_blake2s_init(struct shash_desc *desc) |
| 388 | +{ |
| 389 | + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); |
| 390 | + struct blake2s_state *state = shash_desc_ctx(desc); |
| 391 | + const int outlen = crypto_shash_digestsize(desc->tfm); |
| 392 | + |
| 393 | + if (tctx->keylen) |
| 394 | + blake2s_init_key(state, outlen, tctx->key, tctx->keylen); |
| 395 | + else |
| 396 | + blake2s_init(state, outlen); |
| 397 | + |
| 398 | + return 0; |
| 399 | +} |
| 400 | + |
| 401 | +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, |
| 402 | + unsigned int inlen) |
| 403 | +{ |
| 404 | + struct blake2s_state *state = shash_desc_ctx(desc); |
| 405 | + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; |
| 406 | + |
| 407 | + if (unlikely(!inlen)) |
| 408 | + return 0; |
| 409 | + if (inlen > fill) { |
| 410 | + memcpy(state->buf + state->buflen, in, fill); |
| 411 | + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); |
| 412 | + state->buflen = 0; |
| 413 | + in += fill; |
| 414 | + inlen -= fill; |
| 415 | + } |
| 416 | + if (inlen > BLAKE2S_BLOCK_SIZE) { |
| 417 | + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); |
| 418 | + /* Hash one less (full) block than strictly possible */ |
| 419 | + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); |
| 420 | + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); |
| 421 | + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); |
| 422 | + } |
| 423 | + memcpy(state->buf + state->buflen, in, inlen); |
| 424 | + state->buflen += inlen; |
| 425 | + |
| 426 | + return 0; |
| 427 | +} |
| 428 | + |
| 429 | +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) |
| 430 | +{ |
| 431 | + struct blake2s_state *state = shash_desc_ctx(desc); |
| 432 | + |
| 433 | + blake2s_set_lastblock(state); |
| 434 | + memset(state->buf + state->buflen, 0, |
| 435 | + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ |
| 436 | + blake2s_compress_arch(state, state->buf, 1, state->buflen); |
| 437 | + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); |
| 438 | + memcpy(out, state->h, state->outlen); |
| 439 | + memzero_explicit(state, sizeof(*state)); |
| 440 | + |
| 441 | + return 0; |
| 442 | +} |
| 443 | + |
| 444 | +static struct shash_alg blake2s_algs[] = {{ |
| 445 | + .base.cra_name = "blake2s-128", |
| 446 | + .base.cra_driver_name = "blake2s-128-x86", |
| 447 | + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, |
| 448 | + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), |
| 449 | + .base.cra_priority = 200, |
| 450 | + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, |
| 451 | + .base.cra_module = THIS_MODULE, |
| 452 | + |
| 453 | + .digestsize = BLAKE2S_128_HASH_SIZE, |
| 454 | + .setkey = crypto_blake2s_setkey, |
| 455 | + .init = crypto_blake2s_init, |
| 456 | + .update = crypto_blake2s_update, |
| 457 | + .final = crypto_blake2s_final, |
| 458 | + .descsize = sizeof(struct blake2s_state), |
| 459 | +}, { |
| 460 | + .base.cra_name = "blake2s-160", |
| 461 | + .base.cra_driver_name = "blake2s-160-x86", |
| 462 | + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, |
| 463 | + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), |
| 464 | + .base.cra_priority = 200, |
| 465 | + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, |
| 466 | + .base.cra_module = THIS_MODULE, |
| 467 | + |
| 468 | + .digestsize = BLAKE2S_160_HASH_SIZE, |
| 469 | + .setkey = crypto_blake2s_setkey, |
| 470 | + .init = crypto_blake2s_init, |
| 471 | + .update = crypto_blake2s_update, |
| 472 | + .final = crypto_blake2s_final, |
| 473 | + .descsize = sizeof(struct blake2s_state), |
| 474 | +}, { |
| 475 | + .base.cra_name = "blake2s-224", |
| 476 | + .base.cra_driver_name = "blake2s-224-x86", |
| 477 | + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, |
| 478 | + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), |
| 479 | + .base.cra_priority = 200, |
| 480 | + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, |
| 481 | + .base.cra_module = THIS_MODULE, |
| 482 | + |
| 483 | + .digestsize = BLAKE2S_224_HASH_SIZE, |
| 484 | + .setkey = crypto_blake2s_setkey, |
| 485 | + .init = crypto_blake2s_init, |
| 486 | + .update = crypto_blake2s_update, |
| 487 | + .final = crypto_blake2s_final, |
| 488 | + .descsize = sizeof(struct blake2s_state), |
| 489 | +}, { |
| 490 | + .base.cra_name = "blake2s-256", |
| 491 | + .base.cra_driver_name = "blake2s-256-x86", |
| 492 | + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, |
| 493 | + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), |
| 494 | + .base.cra_priority = 200, |
| 495 | + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, |
| 496 | + .base.cra_module = THIS_MODULE, |
| 497 | + |
| 498 | + .digestsize = BLAKE2S_256_HASH_SIZE, |
| 499 | + .setkey = crypto_blake2s_setkey, |
| 500 | + .init = crypto_blake2s_init, |
| 501 | + .update = crypto_blake2s_update, |
| 502 | + .final = crypto_blake2s_final, |
| 503 | + .descsize = sizeof(struct blake2s_state), |
| 504 | +}}; |
| 505 | + |
| 506 | +static int __init blake2s_mod_init(void) |
| 507 | +{ |
| 508 | + if (!boot_cpu_has(X86_FEATURE_SSSE3)) |
| 509 | + return 0; |
| 510 | + |
| 511 | + static_branch_enable(&blake2s_use_ssse3); |
| 512 | + |
| 513 | + if (IS_ENABLED(CONFIG_AS_AVX512) && |
| 514 | + boot_cpu_has(X86_FEATURE_AVX) && |
| 515 | + boot_cpu_has(X86_FEATURE_AVX2) && |
| 516 | + boot_cpu_has(X86_FEATURE_AVX512F) && |
| 517 | + boot_cpu_has(X86_FEATURE_AVX512VL) && |
| 518 | + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | |
| 519 | + XFEATURE_MASK_AVX512, NULL)) |
| 520 | + static_branch_enable(&blake2s_use_avx512); |
| 521 | + |
| 522 | + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); |
| 523 | +} |
| 524 | + |
| 525 | +static void __exit blake2s_mod_exit(void) |
| 526 | +{ |
| 527 | + if (boot_cpu_has(X86_FEATURE_SSSE3)) |
| 528 | + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); |
| 529 | +} |
| 530 | + |
| 531 | +module_init(blake2s_mod_init); |
| 532 | +module_exit(blake2s_mod_exit); |
| 533 | + |
| 534 | +MODULE_ALIAS_CRYPTO("blake2s-128"); |
| 535 | +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); |
| 536 | +MODULE_ALIAS_CRYPTO("blake2s-160"); |
| 537 | +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); |
| 538 | +MODULE_ALIAS_CRYPTO("blake2s-224"); |
| 539 | +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); |
| 540 | +MODULE_ALIAS_CRYPTO("blake2s-256"); |
| 541 | +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); |
| 542 | +MODULE_LICENSE("GPL v2"); |
| 543 | --- a/crypto/Kconfig |
| 544 | +++ b/crypto/Kconfig |
| 545 | @@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S |
| 546 | |
| 547 | See https://blake2.net for further information. |
| 548 | |
| 549 | +config CRYPTO_BLAKE2S_X86 |
| 550 | + tristate "BLAKE2s digest algorithm (x86 accelerated version)" |
| 551 | + depends on X86 && 64BIT |
| 552 | + select CRYPTO_LIB_BLAKE2S_GENERIC |
| 553 | + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S |
| 554 | + |
| 555 | config CRYPTO_CRCT10DIF |
| 556 | tristate "CRCT10DIF algorithm" |
| 557 | select CRYPTO_HASH |