| b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame] | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Thu, 23 Apr 2020 15:54:04 -0600 |
| 4 | Subject: [PATCH] crypto: arch/lib - limit simd usage to 4k chunks |
| 5 | |
| 6 | commit 706024a52c614b478b63f7728d202532ce6591a9 upstream. |
| 7 | |
| 8 | The initial Zinc patchset, after some mailing list discussion, contained |
| 9 | code to ensure that a kernel_fpu_begin() section would not last for more than |
| 10 | a 4k chunk, since it disables preemption. The choice of 4k isn't totally |
| 11 | scientific, but it's not a bad guess either, and it's what's used in |
| 12 | the x86 poly1305, blake2s, and nhpoly1305 code already (in the form |
| 13 | of PAGE_SIZE, which this commit corrects to be explicitly 4k for the |
| 14 | former two). |
| 15 | |
| 16 | Ard did some back-of-the-envelope calculations and found that |
| 17 | at 5 cycles/byte (overestimate) on a 1 GHz processor (pretty slow), 4k |
| 18 | means preemption is disabled for at most about 20us, which Sebastian |
| 19 | confirmed was probably a good limit. |
| 20 | |
| 21 | Unfortunately the chunking appears to have been left out of the final |
| 22 | patchset that added the glue code. So, this commit adds it back in. |
| 23 | |
| 24 | Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function") |
| 25 | Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function") |
| 26 | Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function") |
| 27 | Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel") |
| 28 | Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") |
| 29 | Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") |
| 30 | Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation") |
| 31 | Cc: Eric Biggers <ebiggers@google.com> |
| 32 | Cc: Ard Biesheuvel <ardb@kernel.org> |
| 33 | Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| 34 | Cc: stable@vger.kernel.org |
| 35 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 36 | Reviewed-by: Ard Biesheuvel <ardb@kernel.org> |
| 37 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 38 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 39 | --- |
| 40 | arch/arm/crypto/chacha-glue.c | 14 +++++++++++--- |
| 41 | arch/arm/crypto/poly1305-glue.c | 15 +++++++++++---- |
| 42 | arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++--- |
| 43 | arch/arm64/crypto/poly1305-glue.c | 15 +++++++++++---- |
| 44 | arch/x86/crypto/blake2s-glue.c | 10 ++++------ |
| 45 | arch/x86/crypto/chacha_glue.c | 14 +++++++++++--- |
| 46 | arch/x86/crypto/poly1305_glue.c | 13 ++++++------- |
| 47 | 7 files changed, 65 insertions(+), 30 deletions(-) |
| 48 | |
| 49 | --- a/arch/arm/crypto/chacha-glue.c |
| 50 | +++ b/arch/arm/crypto/chacha-glue.c |
| 51 | @@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *d |
| 52 | return; |
| 53 | } |
| 54 | |
| 55 | - kernel_neon_begin(); |
| 56 | - chacha_doneon(state, dst, src, bytes, nrounds); |
| 57 | - kernel_neon_end(); |
| 58 | + do { |
| 59 | + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); |
| 60 | + |
| 61 | + kernel_neon_begin(); |
| 62 | + chacha_doneon(state, dst, src, todo, nrounds); |
| 63 | + kernel_neon_end(); |
| 64 | + |
| 65 | + bytes -= todo; |
| 66 | + src += todo; |
| 67 | + dst += todo; |
| 68 | + } while (bytes); |
| 69 | } |
| 70 | EXPORT_SYMBOL(chacha_crypt_arch); |
| 71 | |
| 72 | --- a/arch/arm/crypto/poly1305-glue.c |
| 73 | +++ b/arch/arm/crypto/poly1305-glue.c |
| 74 | @@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly130 |
| 75 | unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); |
| 76 | |
| 77 | if (static_branch_likely(&have_neon) && do_neon) { |
| 78 | - kernel_neon_begin(); |
| 79 | - poly1305_blocks_neon(&dctx->h, src, len, 1); |
| 80 | - kernel_neon_end(); |
| 81 | + do { |
| 82 | + unsigned int todo = min_t(unsigned int, len, SZ_4K); |
| 83 | + |
| 84 | + kernel_neon_begin(); |
| 85 | + poly1305_blocks_neon(&dctx->h, src, todo, 1); |
| 86 | + kernel_neon_end(); |
| 87 | + |
| 88 | + len -= todo; |
| 89 | + src += todo; |
| 90 | + } while (len); |
| 91 | } else { |
| 92 | poly1305_blocks_arm(&dctx->h, src, len, 1); |
| 93 | + src += len; |
| 94 | } |
| 95 | - src += len; |
| 96 | nbytes %= POLY1305_BLOCK_SIZE; |
| 97 | } |
| 98 | |
| 99 | --- a/arch/arm64/crypto/chacha-neon-glue.c |
| 100 | +++ b/arch/arm64/crypto/chacha-neon-glue.c |
| 101 | @@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *d |
| 102 | !crypto_simd_usable()) |
| 103 | return chacha_crypt_generic(state, dst, src, bytes, nrounds); |
| 104 | |
| 105 | - kernel_neon_begin(); |
| 106 | - chacha_doneon(state, dst, src, bytes, nrounds); |
| 107 | - kernel_neon_end(); |
| 108 | + do { |
| 109 | + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); |
| 110 | + |
| 111 | + kernel_neon_begin(); |
| 112 | + chacha_doneon(state, dst, src, todo, nrounds); |
| 113 | + kernel_neon_end(); |
| 114 | + |
| 115 | + bytes -= todo; |
| 116 | + src += todo; |
| 117 | + dst += todo; |
| 118 | + } while (bytes); |
| 119 | } |
| 120 | EXPORT_SYMBOL(chacha_crypt_arch); |
| 121 | |
| 122 | --- a/arch/arm64/crypto/poly1305-glue.c |
| 123 | +++ b/arch/arm64/crypto/poly1305-glue.c |
| 124 | @@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly130 |
| 125 | unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); |
| 126 | |
| 127 | if (static_branch_likely(&have_neon) && crypto_simd_usable()) { |
| 128 | - kernel_neon_begin(); |
| 129 | - poly1305_blocks_neon(&dctx->h, src, len, 1); |
| 130 | - kernel_neon_end(); |
| 131 | + do { |
| 132 | + unsigned int todo = min_t(unsigned int, len, SZ_4K); |
| 133 | + |
| 134 | + kernel_neon_begin(); |
| 135 | + poly1305_blocks_neon(&dctx->h, src, todo, 1); |
| 136 | + kernel_neon_end(); |
| 137 | + |
| 138 | + len -= todo; |
| 139 | + src += todo; |
| 140 | + } while (len); |
| 141 | } else { |
| 142 | poly1305_blocks(&dctx->h, src, len, 1); |
| 143 | + src += len; |
| 144 | } |
| 145 | - src += len; |
| 146 | nbytes %= POLY1305_BLOCK_SIZE; |
| 147 | } |
| 148 | |
| 149 | --- a/arch/x86/crypto/blake2s-glue.c |
| 150 | +++ b/arch/x86/crypto/blake2s-glue.c |
| 151 | @@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2 |
| 152 | const u32 inc) |
| 153 | { |
| 154 | /* SIMD disables preemption, so relax after processing each page. */ |
| 155 | - BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); |
| 156 | + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); |
| 157 | |
| 158 | if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { |
| 159 | blake2s_compress_generic(state, block, nblocks, inc); |
| 160 | return; |
| 161 | } |
| 162 | |
| 163 | - for (;;) { |
| 164 | + do { |
| 165 | const size_t blocks = min_t(size_t, nblocks, |
| 166 | - PAGE_SIZE / BLAKE2S_BLOCK_SIZE); |
| 167 | + SZ_4K / BLAKE2S_BLOCK_SIZE); |
| 168 | |
| 169 | kernel_fpu_begin(); |
| 170 | if (IS_ENABLED(CONFIG_AS_AVX512) && |
| 171 | @@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2 |
| 172 | kernel_fpu_end(); |
| 173 | |
| 174 | nblocks -= blocks; |
| 175 | - if (!nblocks) |
| 176 | - break; |
| 177 | block += blocks * BLAKE2S_BLOCK_SIZE; |
| 178 | - } |
| 179 | + } while (nblocks); |
| 180 | } |
| 181 | EXPORT_SYMBOL(blake2s_compress_arch); |
| 182 | |
| 183 | --- a/arch/x86/crypto/chacha_glue.c |
| 184 | +++ b/arch/x86/crypto/chacha_glue.c |
| 185 | @@ -154,9 +154,17 @@ void chacha_crypt_arch(u32 *state, u8 *d |
| 186 | bytes <= CHACHA_BLOCK_SIZE) |
| 187 | return chacha_crypt_generic(state, dst, src, bytes, nrounds); |
| 188 | |
| 189 | - kernel_fpu_begin(); |
| 190 | - chacha_dosimd(state, dst, src, bytes, nrounds); |
| 191 | - kernel_fpu_end(); |
| 192 | + do { |
| 193 | + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); |
| 194 | + |
| 195 | + kernel_fpu_begin(); |
| 196 | + chacha_dosimd(state, dst, src, todo, nrounds); |
| 197 | + kernel_fpu_end(); |
| 198 | + |
| 199 | + bytes -= todo; |
| 200 | + src += todo; |
| 201 | + dst += todo; |
| 202 | + } while (bytes); |
| 203 | } |
| 204 | EXPORT_SYMBOL(chacha_crypt_arch); |
| 205 | |
| 206 | --- a/arch/x86/crypto/poly1305_glue.c |
| 207 | +++ b/arch/x86/crypto/poly1305_glue.c |
| 208 | @@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *c |
| 209 | struct poly1305_arch_internal *state = ctx; |
| 210 | |
| 211 | /* SIMD disables preemption, so relax after processing each page. */ |
| 212 | - BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || |
| 213 | - PAGE_SIZE % POLY1305_BLOCK_SIZE); |
| 214 | + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || |
| 215 | + SZ_4K % POLY1305_BLOCK_SIZE); |
| 216 | |
| 217 | if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| 218 | (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || |
| 219 | @@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *c |
| 220 | return; |
| 221 | } |
| 222 | |
| 223 | - for (;;) { |
| 224 | - const size_t bytes = min_t(size_t, len, PAGE_SIZE); |
| 225 | + do { |
| 226 | + const size_t bytes = min_t(size_t, len, SZ_4K); |
| 227 | |
| 228 | kernel_fpu_begin(); |
| 229 | if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) |
| 230 | @@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *c |
| 231 | else |
| 232 | poly1305_blocks_avx(ctx, inp, bytes, padbit); |
| 233 | kernel_fpu_end(); |
| 234 | + |
| 235 | len -= bytes; |
| 236 | - if (!len) |
| 237 | - break; |
| 238 | inp += bytes; |
| 239 | - } |
| 240 | + } while (len); |
| 241 | } |
| 242 | |
| 243 | static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |