From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 8 Jul 2020 12:11:18 +0300
Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array

commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.

Because the x86 port does not support allocating objects on the stack
with an alignment that exceeds 8 bytes, we have a rather ugly hack in
the x86 code for ChaCha to ensure that the state array is aligned to
16 bytes, allowing the SSSE3 implementation of the algorithm to use
aligned loads.

Given that the performance benefit of using aligned loads appears to
be limited (~0.25% for 1k blocks using tcrypt on a Core i7-8650U), and
that this hack has leaked into generic ChaCha code, let's just remove
it.

Cc: Martin Willi <martin@strongswan.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin Willi <martin@strongswan.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
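Note: the hack removed below over-allocated the state buffer and then
realigned the pointer by hand. A simplified sketch of the removed
pattern (for illustration only, not a drop-in snippet):

	/*
	 * The stack only guarantees 8-byte alignment on x86, so
	 * over-allocate by two u32s and let PTR_ALIGN() round the
	 * pointer up to the next 16-byte boundary in the buffer.
	 */
	u32 state_buf[16 + 2] __aligned(8);
	u32 *state = PTR_ALIGN(state_buf + 0, 16);

With movdqu, the SSSE3 code tolerates any alignment, so a plain
u32 state[CHACHA_STATE_WORDS] __aligned(8) is enough.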
 arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
 arch/x86/crypto/chacha_glue.c         | 17 ++---------------
 include/crypto/chacha.h               |  4 ----
 3 files changed, 10 insertions(+), 27 deletions(-)

--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
 	FRAME_BEGIN

 	# x0..3 = s0..3
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
 	movdqa		%xmm0,%xmm8
 	movdqa		%xmm1,%xmm9
 	movdqa		%xmm2,%xmm10
@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
 	# %edx: nrounds
 	FRAME_BEGIN

-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3

 	mov		%edx,%r8d
 	call		chacha_permute
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -14,8 +14,6 @@
 #include <linux/module.h>
 #include <asm/simd.h>

-#define CHACHA_STATE_ALIGN 16
-
 asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8

 void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
 		hchacha_block_generic(state, stream, nrounds);
 	} else {
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);

 void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	chacha_init_generic(state, key, iv);
 }
 EXPORT_SYMBOL(chacha_init_arch);
@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
 void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 		       int nrounds)
 {
-	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
 	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
 	    bytes <= CHACHA_BLOCK_SIZE)
 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct skcipher_walk walk;
 	int err;

 	err = skcipher_walk_virt(&walk, req, false);

-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-
 	chacha_init_generic(state, ctx->key, iv);

 	while (walk.nbytes > 0) {
@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 *state, state_buf[16 + 2] __aligned(8);
+	u32 state[CHACHA_STATE_WORDS] __aligned(8);
 	struct chacha_ctx subctx;
 	u8 real_iv[16];

-	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 	chacha_init_generic(state, ctx->key, req->iv);

 	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,11 +25,7 @@
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12

-#ifdef CONFIG_X86_64
-#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
-#else
 #define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
-#endif

 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE	32
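
Note (editorial, not part of the upstream patch): with CHACHA_STATE_WORDS
now 16 on every architecture, a caller of the ChaCha library interface can
declare the state array with natural alignment. A minimal sketch of a
hypothetical caller; example_encrypt is illustrative only, and it assumes
the library interface declared in include/crypto/chacha.h:

	#include <crypto/chacha.h>

	static void example_encrypt(const u32 *key, const u8 *iv, u8 *dst,
				    const u8 *src, unsigned int len)
	{
		/* 8-byte stack alignment now suffices: the SSSE3 code
		 * loads the state with movdqu instead of movdqa. */
		u32 state[CHACHA_STATE_WORDS] __aligned(8);

		chacha_init(state, key, iv);
		chacha_crypt(state, dst, src, len, 20);	/* 20 rounds: ChaCha20 */
	}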