| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| From: Ard Biesheuvel <ardb@kernel.org> |
| Date: Fri, 6 Nov 2020 17:39:38 +0100 |
| Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling |
| |
| commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream. |
| |
| Based on lessons learnt from optimizing the 32-bit version of this driver, |
| we can simplify the arm64 version considerably by reordering the final |
| two stores when the input length is not a multiple of 64 bytes. This |
| removes the need for permutation instructions to recover the elements |
| that are clobbered by the final overlapping store: the store of the |
| penultimate block now follows it, and that store already carries the |
| correct values for those elements. |
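| |
| As a rough C sketch of that store ordering (illustrative only, not the |
| kernel code; the names and the 64 < len < 128 assumption are made up for |
| the example): the shifted final block is written as a full 64-byte |
| overlapping store ending exactly at the end of the output, and the |
| penultimate store that follows repairs whatever the overlap clobbered: |
| |
|   #include <stdint.h> |
|   #include <stddef.h> |
|   #include <string.h> |
| |
|   /* Assumes 64 < len < 128; last64 holds the final block's output bytes, |
|    * already shifted so that its valid tail ends at out + len (the permute |
|    * table performs that shift in the real code). */ |
|   static void store_tail(uint8_t *out, size_t len, |
|                          const uint8_t *penultimate, const uint8_t *last64) |
|   { |
|           memcpy(out + len - 64, last64, 64);  /* overlapping store */ |
|           memcpy(out, penultimate, 64);        /* restores the clobbered bytes */ |
|   } |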
| |
| While at it, simplify the overlapping loads as well by calculating the |
| address of the final overlapping load upfront and switching to this |
| address for every load that would otherwise extend past the end of the |
| source buffer. |
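| |
| A similarly rough sketch of the load side (again illustrative, with |
| hypothetical names): the address of the final in-bounds 64-byte load is |
| computed once, and any load that would run past the end of the source |
| switches to that address and simply re-reads part of the tail: |
| |
|   #include <stdint.h> |
|   #include <stddef.h> |
|   #include <string.h> |
| |
|   /* Assumes len >= 64; blocks[] has room for ceil(len / 64) entries. */ |
|   static void load_blocks(uint8_t blocks[][64], const uint8_t *src, size_t len) |
|   { |
|           const uint8_t *last = src + len - 64;   /* final overlapping load */ |
|           const uint8_t *p = src; |
|           size_t n = (len + 63) / 64; |
| |
|           for (size_t i = 0; i < n; i++, p += 64) { |
|                   const uint8_t *q = (p + 64 <= src + len) ? p : last; |
| |
|                   memcpy(blocks[i], q, 64);       /* never reads past src + len */ |
|           } |
|   } |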
| |
| There is no impact on performance, but the resulting code is substantially |
| smaller and easier to follow. |
| |
| Cc: Eric Biggers <ebiggers@google.com> |
| Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> |
| Signed-off-by: Ard Biesheuvel <ardb@kernel.org> |
| Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| --- |
| arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++----------------- |
| 1 file changed, 69 insertions(+), 124 deletions(-) |
| |
| --- a/arch/arm64/crypto/chacha-neon-core.S |
| +++ b/arch/arm64/crypto/chacha-neon-core.S |
| @@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon) |
| adr_l x10, .Lpermute |
| and x5, x4, #63 |
| add x10, x10, x5 |
| - add x11, x10, #64 |
| |
| // |
| // This function encrypts four consecutive ChaCha blocks by loading |
| @@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 ) |
| zip2 v31.4s, v14.4s, v15.4s |
| eor a15, a15, w9 |
| |
| - mov x3, #64 |
| + add x3, x2, x4 |
| + sub x3, x3, #128 // start of last block |
| + |
| subs x5, x4, #128 |
| - add x6, x5, x2 |
| - csel x3, x3, xzr, ge |
| - csel x2, x2, x6, ge |
| + csel x2, x2, x3, ge |
| |
| // interleave 64-bit words in state n, n+2 |
| zip1 v0.2d, v16.2d, v18.2d |
| @@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 ) |
| zip1 v8.2d, v17.2d, v19.2d |
| zip2 v12.2d, v17.2d, v19.2d |
| stp a2, a3, [x1, #-56] |
| - ld1 {v16.16b-v19.16b}, [x2], x3 |
| |
| subs x6, x4, #192 |
| - ccmp x3, xzr, #4, lt |
| - add x7, x6, x2 |
| - csel x3, x3, xzr, eq |
| - csel x2, x2, x7, eq |
| + ld1 {v16.16b-v19.16b}, [x2], #64 |
| + csel x2, x2, x3, ge |
| |
| zip1 v1.2d, v20.2d, v22.2d |
| zip2 v5.2d, v20.2d, v22.2d |
| @@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 ) |
| zip1 v9.2d, v21.2d, v23.2d |
| zip2 v13.2d, v21.2d, v23.2d |
| stp a6, a7, [x1, #-40] |
| - ld1 {v20.16b-v23.16b}, [x2], x3 |
| |
| subs x7, x4, #256 |
| - ccmp x3, xzr, #4, lt |
| - add x8, x7, x2 |
| - csel x3, x3, xzr, eq |
| - csel x2, x2, x8, eq |
| + ld1 {v20.16b-v23.16b}, [x2], #64 |
| + csel x2, x2, x3, ge |
| |
| zip1 v2.2d, v24.2d, v26.2d |
| zip2 v6.2d, v24.2d, v26.2d |
| @@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 ) |
| zip1 v10.2d, v25.2d, v27.2d |
| zip2 v14.2d, v25.2d, v27.2d |
| stp a10, a11, [x1, #-24] |
| - ld1 {v24.16b-v27.16b}, [x2], x3 |
| |
| subs x8, x4, #320 |
| - ccmp x3, xzr, #4, lt |
| - add x9, x8, x2 |
| - csel x2, x2, x9, eq |
| + ld1 {v24.16b-v27.16b}, [x2], #64 |
| + csel x2, x2, x3, ge |
| |
| zip1 v3.2d, v28.2d, v30.2d |
| zip2 v7.2d, v28.2d, v30.2d |
| @@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 ) |
| zip1 v11.2d, v29.2d, v31.2d |
| zip2 v15.2d, v29.2d, v31.2d |
| stp a14, a15, [x1, #-8] |
| + |
| + tbnz x5, #63, .Lt128 |
| ld1 {v28.16b-v31.16b}, [x2] |
| |
| // xor with corresponding input, write to output |
| - tbnz x5, #63, 0f |
| eor v16.16b, v16.16b, v0.16b |
| eor v17.16b, v17.16b, v1.16b |
| eor v18.16b, v18.16b, v2.16b |
| eor v19.16b, v19.16b, v3.16b |
| - st1 {v16.16b-v19.16b}, [x1], #64 |
| - cbz x5, .Lout |
| |
| - tbnz x6, #63, 1f |
| + tbnz x6, #63, .Lt192 |
| + |
| eor v20.16b, v20.16b, v4.16b |
| eor v21.16b, v21.16b, v5.16b |
| eor v22.16b, v22.16b, v6.16b |
| eor v23.16b, v23.16b, v7.16b |
| - st1 {v20.16b-v23.16b}, [x1], #64 |
| - cbz x6, .Lout |
| |
| - tbnz x7, #63, 2f |
| + st1 {v16.16b-v19.16b}, [x1], #64 |
| + tbnz x7, #63, .Lt256 |
| + |
| eor v24.16b, v24.16b, v8.16b |
| eor v25.16b, v25.16b, v9.16b |
| eor v26.16b, v26.16b, v10.16b |
| eor v27.16b, v27.16b, v11.16b |
| - st1 {v24.16b-v27.16b}, [x1], #64 |
| - cbz x7, .Lout |
| |
| - tbnz x8, #63, 3f |
| + st1 {v20.16b-v23.16b}, [x1], #64 |
| + tbnz x8, #63, .Lt320 |
| + |
| eor v28.16b, v28.16b, v12.16b |
| eor v29.16b, v29.16b, v13.16b |
| eor v30.16b, v30.16b, v14.16b |
| eor v31.16b, v31.16b, v15.16b |
| + |
| + st1 {v24.16b-v27.16b}, [x1], #64 |
| st1 {v28.16b-v31.16b}, [x1] |
| |
| .Lout: frame_pop |
| ret |
| |
| - // fewer than 128 bytes of in/output |
| -0: ld1 {v8.16b}, [x10] |
| - ld1 {v9.16b}, [x11] |
| - movi v10.16b, #16 |
| - sub x2, x1, #64 |
| - add x1, x1, x5 |
| - ld1 {v16.16b-v19.16b}, [x2] |
| - tbl v4.16b, {v0.16b-v3.16b}, v8.16b |
| - tbx v20.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v5.16b, {v0.16b-v3.16b}, v8.16b |
| - tbx v21.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v6.16b, {v0.16b-v3.16b}, v8.16b |
| - tbx v22.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v7.16b, {v0.16b-v3.16b}, v8.16b |
| - tbx v23.16b, {v16.16b-v19.16b}, v9.16b |
| - |
| - eor v20.16b, v20.16b, v4.16b |
| - eor v21.16b, v21.16b, v5.16b |
| - eor v22.16b, v22.16b, v6.16b |
| - eor v23.16b, v23.16b, v7.16b |
| - st1 {v20.16b-v23.16b}, [x1] |
| - b .Lout |
| - |
| // fewer than 192 bytes of in/output |
| -1: ld1 {v8.16b}, [x10] |
| - ld1 {v9.16b}, [x11] |
| - movi v10.16b, #16 |
| - add x1, x1, x6 |
| - tbl v0.16b, {v4.16b-v7.16b}, v8.16b |
| - tbx v20.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v1.16b, {v4.16b-v7.16b}, v8.16b |
| - tbx v21.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v2.16b, {v4.16b-v7.16b}, v8.16b |
| - tbx v22.16b, {v16.16b-v19.16b}, v9.16b |
| - add v8.16b, v8.16b, v10.16b |
| - add v9.16b, v9.16b, v10.16b |
| - tbl v3.16b, {v4.16b-v7.16b}, v8.16b |
| - tbx v23.16b, {v16.16b-v19.16b}, v9.16b |
| - |
| - eor v20.16b, v20.16b, v0.16b |
| - eor v21.16b, v21.16b, v1.16b |
| - eor v22.16b, v22.16b, v2.16b |
| - eor v23.16b, v23.16b, v3.16b |
| - st1 {v20.16b-v23.16b}, [x1] |
| +.Lt192: cbz x5, 1f // exactly 128 bytes? |
| + ld1 {v28.16b-v31.16b}, [x10] |
| + add x5, x5, x1 |
| + tbl v28.16b, {v4.16b-v7.16b}, v28.16b |
| + tbl v29.16b, {v4.16b-v7.16b}, v29.16b |
| + tbl v30.16b, {v4.16b-v7.16b}, v30.16b |
| + tbl v31.16b, {v4.16b-v7.16b}, v31.16b |
| + |
| +0: eor v20.16b, v20.16b, v28.16b |
| + eor v21.16b, v21.16b, v29.16b |
| + eor v22.16b, v22.16b, v30.16b |
| + eor v23.16b, v23.16b, v31.16b |
| + st1 {v20.16b-v23.16b}, [x5] // overlapping stores |
| +1: st1 {v16.16b-v19.16b}, [x1] |
| b .Lout |
| |
| + // fewer than 128 bytes of in/output |
| +.Lt128: ld1 {v28.16b-v31.16b}, [x10] |
| + add x5, x5, x1 |
| + sub x1, x1, #64 |
| + tbl v28.16b, {v0.16b-v3.16b}, v28.16b |
| + tbl v29.16b, {v0.16b-v3.16b}, v29.16b |
| + tbl v30.16b, {v0.16b-v3.16b}, v30.16b |
| + tbl v31.16b, {v0.16b-v3.16b}, v31.16b |
| + ld1 {v16.16b-v19.16b}, [x1] // reload first output block |
| + b 0b |
| + |
| // fewer than 256 bytes of in/output |
| -2: ld1 {v4.16b}, [x10] |
| - ld1 {v5.16b}, [x11] |
| - movi v6.16b, #16 |
| - add x1, x1, x7 |
| +.Lt256: cbz x6, 2f // exactly 192 bytes? |
| + ld1 {v4.16b-v7.16b}, [x10] |
| + add x6, x6, x1 |
| tbl v0.16b, {v8.16b-v11.16b}, v4.16b |
| - tbx v24.16b, {v20.16b-v23.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v1.16b, {v8.16b-v11.16b}, v4.16b |
| - tbx v25.16b, {v20.16b-v23.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v2.16b, {v8.16b-v11.16b}, v4.16b |
| - tbx v26.16b, {v20.16b-v23.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v3.16b, {v8.16b-v11.16b}, v4.16b |
| - tbx v27.16b, {v20.16b-v23.16b}, v5.16b |
| - |
| - eor v24.16b, v24.16b, v0.16b |
| - eor v25.16b, v25.16b, v1.16b |
| - eor v26.16b, v26.16b, v2.16b |
| - eor v27.16b, v27.16b, v3.16b |
| - st1 {v24.16b-v27.16b}, [x1] |
| + tbl v1.16b, {v8.16b-v11.16b}, v5.16b |
| + tbl v2.16b, {v8.16b-v11.16b}, v6.16b |
| + tbl v3.16b, {v8.16b-v11.16b}, v7.16b |
| + |
| + eor v28.16b, v28.16b, v0.16b |
| + eor v29.16b, v29.16b, v1.16b |
| + eor v30.16b, v30.16b, v2.16b |
| + eor v31.16b, v31.16b, v3.16b |
| + st1 {v28.16b-v31.16b}, [x6] // overlapping stores |
| +2: st1 {v20.16b-v23.16b}, [x1] |
| b .Lout |
| |
| // fewer than 320 bytes of in/output |
| -3: ld1 {v4.16b}, [x10] |
| - ld1 {v5.16b}, [x11] |
| - movi v6.16b, #16 |
| - add x1, x1, x8 |
| +.Lt320: cbz x7, 3f // exactly 256 bytes? |
| + ld1 {v4.16b-v7.16b}, [x10] |
| + add x7, x7, x1 |
| tbl v0.16b, {v12.16b-v15.16b}, v4.16b |
| - tbx v28.16b, {v24.16b-v27.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v1.16b, {v12.16b-v15.16b}, v4.16b |
| - tbx v29.16b, {v24.16b-v27.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v2.16b, {v12.16b-v15.16b}, v4.16b |
| - tbx v30.16b, {v24.16b-v27.16b}, v5.16b |
| - add v4.16b, v4.16b, v6.16b |
| - add v5.16b, v5.16b, v6.16b |
| - tbl v3.16b, {v12.16b-v15.16b}, v4.16b |
| - tbx v31.16b, {v24.16b-v27.16b}, v5.16b |
| + tbl v1.16b, {v12.16b-v15.16b}, v5.16b |
| + tbl v2.16b, {v12.16b-v15.16b}, v6.16b |
| + tbl v3.16b, {v12.16b-v15.16b}, v7.16b |
| |
| eor v28.16b, v28.16b, v0.16b |
| eor v29.16b, v29.16b, v1.16b |
| eor v30.16b, v30.16b, v2.16b |
| eor v31.16b, v31.16b, v3.16b |
| - st1 {v28.16b-v31.16b}, [x1] |
| + st1 {v28.16b-v31.16b}, [x7] // overlapping stores |
| +3: st1 {v24.16b-v27.16b}, [x1] |
| b .Lout |
| ENDPROC(chacha_4block_xor_neon) |
| |
| @@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon) |
| .align L1_CACHE_SHIFT |
| .Lpermute: |
| .set .Li, 0 |
| - .rept 192 |
| + .rept 128 |
| .byte (.Li - 64) |
| .set .Li, .Li + 1 |
| .endr |