From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling

commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.
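
The idea, sketched in plain C rather than NEON assembly (BLOCK,
keystream_block() and xor_stream() are invented names; this is a minimal
illustration of the trick under those assumptions, not the kernel code):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define BLOCK 64

  /* Stand-in for real keystream generation, one BLOCK per call. */
  static void keystream_block(uint8_t ks[BLOCK], uint64_t blkno)
  {
      for (int i = 0; i < BLOCK; i++)
          ks[i] = (uint8_t)(blkno * 31 + i);  /* arbitrary demo values */
  }

  /* XOR a keystream into len bytes (len >= BLOCK) using only full
   * BLOCK-sized loads and stores. */
  static void xor_stream(uint8_t *dst, const uint8_t *src, size_t len)
  {
      uint8_t ks[BLOCK], buf[BLOCK], tailbuf[BLOCK];
      size_t full = len / BLOCK;      /* complete blocks */
      size_t tail = len % BLOCK;      /* bytes in the partial tail block */
      size_t i;

      assert(len >= BLOCK);

      /* all full blocks except the last one: nothing special */
      for (i = 0; i + 1 < full; i++) {
          keystream_block(ks, i);
          memcpy(buf, src + i * BLOCK, BLOCK);
          for (int j = 0; j < BLOCK; j++)
              buf[j] ^= ks[j];
          memcpy(dst + i * BLOCK, buf, BLOCK);
      }

      /* last full block: compute it, but hold its store back */
      keystream_block(ks, i);
      memcpy(buf, src + i * BLOCK, BLOCK);
      for (int j = 0; j < BLOCK; j++)
          buf[j] ^= ks[j];

      if (tail) {
          /* overlapping load: read a full block that ends exactly at
           * the end of the source instead of reading past it */
          memcpy(tailbuf, src + len - BLOCK, BLOCK);

          /* XOR the tail keystream into the last 'tail' bytes only;
           * the first BLOCK - tail bytes of tailbuf stay wrong, like
           * the unused lanes in the NEON version */
          keystream_block(ks, i + 1);
          for (size_t j = 0; j < tail; j++)
              tailbuf[BLOCK - tail + j] ^= ks[j];

          /* overlapping store, ending exactly at dst + len */
          memcpy(dst + len - BLOCK, tailbuf, BLOCK);
      }

      /* the in-order store of the last full block comes after the
       * overlapping store and rewrites the bytes it clobbered with the
       * values they must carry anyway, so no permutation/merge step is
       * needed for the tail */
      memcpy(dst + i * BLOCK, buf, BLOCK);
  }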

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
1 file changed, 69 insertions(+), 124 deletions(-)

--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
- add x11, x10, #64

//
// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9

- mov x3, #64
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
subs x5, x4, #128
- add x6, x5, x2
- csel x3, x3, xzr, ge
- csel x2, x2, x6, ge
+ csel x2, x2, x3, ge

// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
stp a2, a3, [x1, #-56]
- ld1 {v16.16b-v19.16b}, [x2], x3

subs x6, x4, #192
- ccmp x3, xzr, #4, lt
- add x7, x6, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x7, eq
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge

zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
stp a6, a7, [x1, #-40]
- ld1 {v20.16b-v23.16b}, [x2], x3

subs x7, x4, #256
- ccmp x3, xzr, #4, lt
- add x8, x7, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x8, eq
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge

zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
stp a10, a11, [x1, #-24]
- ld1 {v24.16b-v27.16b}, [x2], x3

subs x8, x4, #320
- ccmp x3, xzr, #4, lt
- add x9, x8, x2
- csel x2, x2, x9, eq
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge

zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]

// xor with corresponding input, write to output
- tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
- st1 {v16.16b-v19.16b}, [x1], #64
- cbz x5, .Lout

- tbnz x6, #63, 1f
+ tbnz x6, #63, .Lt192
+
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1], #64
- cbz x6, .Lout

- tbnz x7, #63, 2f
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
- st1 {v24.16b-v27.16b}, [x1], #64
- cbz x7, .Lout

- tbnz x8, #63, 3f
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]

.Lout: frame_pop
ret

- // fewer than 128 bytes of in/output
-0: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- sub x2, x1, #64
- add x1, x1, x5
- ld1 {v16.16b-v19.16b}, [x2]
- tbl v4.16b, {v0.16b-v3.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v5.16b, {v0.16b-v3.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v6.16b, {v0.16b-v3.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v7.16b, {v0.16b-v3.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
// fewer than 192 bytes of in/output
-1: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- add x1, x1, x6
- tbl v0.16b, {v4.16b-v7.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v1.16b, {v4.16b-v7.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v2.16b, {v4.16b-v7.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v3.16b, {v4.16b-v7.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v1.16b
- eor v22.16b, v22.16b, v2.16b
- eor v23.16b, v23.16b, v3.16b
- st1 {v20.16b-v23.16b}, [x1]
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
b .Lout

+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
// fewer than 256 bytes of in/output
-2: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x7
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbx v24.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v8.16b-v11.16b}, v4.16b
- tbx v25.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v8.16b-v11.16b}, v4.16b
- tbx v26.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v4.16b
- tbx v27.16b, {v20.16b-v23.16b}, v5.16b
-
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- st1 {v24.16b-v27.16b}, [x1]
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
b .Lout

// fewer than 320 bytes of in/output
-3: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x8
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbx v28.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v12.16b-v15.16b}, v4.16b
- tbx v29.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v12.16b-v15.16b}, v4.16b
- tbx v30.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v4.16b
- tbx v31.16b, {v24.16b-v27.16b}, v5.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b

eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x1]
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
b .Lout
ENDPROC(chacha_4block_xor_neon)

@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
- .rept 192
+ .rept 128
.byte (.Li - 64)
.set .Li, .Li + 1
.endr