From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:10 +0100
Subject: [PATCH] crypto: x86/chacha - expose SIMD ChaCha routine as library
 function

commit 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 upstream.

Wire the existing x86 SIMD ChaCha code into the new ChaCha library
interface, so that users of the library interface will get the
accelerated version when available.

Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/chacha_glue.c | 91 +++++++++++++++++++++++++----------
 crypto/Kconfig                |  1 +
 include/crypto/chacha.h       |  6 +++
 3 files changed, 73 insertions(+), 25 deletions(-)

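Editorial note, not part of the upstream commit: the sketch below is a
minimal, self-contained illustration of the static-key pattern this patch
switches to. The key defaults to false (generic path); static_branch_enable()
at module init flips it once the CPU features have been probed, patching the
branch sites in place, so each later call costs a direct jump rather than a
load and conditional test. crypt_simd() and crypt_generic() are hypothetical
stand-ins for the accelerated and generic routines.

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <asm/cpufeature.h>

void crypt_simd(u8 *dst, const u8 *src, unsigned int bytes);	/* hypothetical */
void crypt_generic(u8 *dst, const u8 *src, unsigned int bytes);	/* hypothetical */

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_simd);

/* Branch site: compiled as straight-line code, repatched when the key flips. */
static void do_crypt(u8 *dst, const u8 *src, unsigned int bytes)
{
	if (static_branch_likely(&have_simd))
		crypt_simd(dst, src, bytes);
	else
		crypt_generic(dst, src, bytes);
}

static int __init example_init(void)
{
	/* Probe once at boot; the key stays false if SSSE3 is absent. */
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&have_simd);
	return 0;
}
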
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					unsigned int len, int nrounds);
 asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-#ifdef CONFIG_AS_AVX2
+
 asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
-static bool chacha_use_avx2;
-#ifdef CONFIG_AS_AVX512
+
 asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
-static bool chacha_use_avx512vl;
-#endif
-#endif
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
 
 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
 {
@@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsig
 static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 			  unsigned int bytes, int nrounds)
 {
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
-	if (chacha_use_avx512vl) {
+	if (IS_ENABLED(CONFIG_AS_AVX512) &&
+	    static_branch_likely(&chacha_use_avx512vl)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx512vl(state, dst, src, bytes,
 						   nrounds);
@@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8
 			return;
 		}
 	}
-#endif
-	if (chacha_use_avx2) {
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    static_branch_likely(&chacha_use_avx2)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
 			bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8
 			return;
 		}
 	}
-#endif
+
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE * 4;
@@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8
 	}
 }
 
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+		hchacha_block_generic(state, stream, nrounds);
+	} else {
+		kernel_fpu_begin();
+		hchacha_block_ssse3(state, stream, nrounds);
+		kernel_fpu_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+		       int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE)
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_fpu_begin();
+	chacha_dosimd(state, dst, src, bytes, nrounds);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
@@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
-		if (!crypto_simd_usable()) {
+		if (!static_branch_likely(&chacha_use_simd) ||
+		    !crypto_simd_usable()) {
 			chacha_crypt_generic(state, walk.dst.virt.addr,
 					     walk.src.virt.addr, nbytes,
 					     ctx->nrounds);
@@ -246,18 +284,21 @@ static struct skcipher_alg algs[] = {
 static int __init chacha_simd_mod_init(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
+		return 0;
 
-#ifdef CONFIG_AS_AVX2
-	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			  boot_cpu_has(X86_FEATURE_AVX2) &&
-			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
-	chacha_use_avx512vl = chacha_use_avx2 &&
-			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
-			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
+	static_branch_enable(&chacha_use_simd);
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		static_branch_enable(&chacha_use_avx2);
+
+		if (IS_ENABLED(CONFIG_AS_AVX512) &&
+		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+			static_branch_enable(&chacha_use_avx512vl);
+	}
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }

--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 	help
 	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
 	  XChaCha20, and XChaCha12 stream ciphers.
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,6 +25,12 @@
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
 
+#ifdef CONFIG_X86_64
+#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+#else
+#define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
+#endif
+
 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE		32
 
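Editorial usage note, not part of the upstream commit: a hedged sketch of how
a consumer of the library interface might drive the entry points exported
above. example_chacha20_encrypt() and its parameters are hypothetical; in-tree
users would normally go through the dispatching helpers in <crypto/chacha.h>
rather than calling the _arch functions directly. The state buffer is declared
with CHACHA_STATE_WORDS so the x86 code has room to realign it to
CHACHA_STATE_ALIGN internally.

#include <crypto/chacha.h>
#include <linux/string.h>
#include <asm/unaligned.h>

/* Hypothetical caller, for illustration only. */
static void example_chacha20_encrypt(u8 *dst, const u8 *src, unsigned int len,
				     const u8 raw_key[CHACHA_KEY_SIZE],
				     const u8 iv[CHACHA_IV_SIZE])
{
	u32 state[CHACHA_STATE_WORDS];
	u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
	unsigned int i;

	/* The library takes the key as little-endian 32-bit words. */
	for (i = 0; i < CHACHA_KEY_SIZE / sizeof(u32); i++)
		key[i] = get_unaligned_le32(raw_key + i * sizeof(u32));

	chacha_init_arch(state, key, iv);		/* iv = counter + nonce, 16 bytes */
	chacha_crypt_arch(state, dst, src, len, 20);	/* 20 rounds: ChaCha20 */

	memzero_explicit(state, sizeof(state));
	memzero_explicit(key, sizeof(key));
}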