| b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame] | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Fri, 8 Nov 2019 13:22:16 +0100 |
| 4 | Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc |
| 5 | MIME-Version: 1.0 |
| 6 | Content-Type: text/plain; charset=UTF-8 |
| 7 | Content-Transfer-Encoding: 8bit |
| 8 | |
| 9 | commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream. |
| 10 | |
| 11 | This imports the accelerated MIPS 32r2 ChaCha20 implementation from the |
| 12 | Zinc patch set. |
| 13 | |
| 14 | Co-developed-by: René van Dorst <opensource@vdorst.com> |
| 15 | Signed-off-by: René van Dorst <opensource@vdorst.com> |
| 16 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 17 | Signed-off-by: Ard Biesheuvel <ardb@kernel.org> |
| 18 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 19 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 20 | --- |
| 21 | arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++ |
| 22 | 1 file changed, 424 insertions(+) |
| 23 | create mode 100644 arch/mips/crypto/chacha-core.S |
| 24 | |
| 25 | --- /dev/null |
| 26 | +++ b/arch/mips/crypto/chacha-core.S |
| 27 | @@ -0,0 +1,424 @@ |
| 28 | +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ |
| 29 | +/* |
| 30 | + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. |
| 31 | + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 32 | + */ |
| 33 | + |
| 34 | +#define MASK_U32 0x3c |
| 35 | +#define CHACHA20_BLOCK_SIZE 64 |
| 36 | +#define STACK_SIZE 32 |
| 37 | + |
| 38 | +#define X0 $t0 |
| 39 | +#define X1 $t1 |
| 40 | +#define X2 $t2 |
| 41 | +#define X3 $t3 |
| 42 | +#define X4 $t4 |
| 43 | +#define X5 $t5 |
| 44 | +#define X6 $t6 |
| 45 | +#define X7 $t7 |
| 46 | +#define X8 $t8 |
| 47 | +#define X9 $t9 |
| 48 | +#define X10 $v1 |
| 49 | +#define X11 $s6 |
| 50 | +#define X12 $s5 |
| 51 | +#define X13 $s4 |
| 52 | +#define X14 $s3 |
| 53 | +#define X15 $s2 |
| 54 | +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ |
| 55 | +#define T0 $s1 |
| 56 | +#define T1 $s0 |
| 57 | +#define T(n) T ## n |
| 58 | +#define X(n) X ## n |
| 59 | + |
| 60 | +/* Input arguments */ |
| 61 | +#define STATE $a0 |
| 62 | +#define OUT $a1 |
| 63 | +#define IN $a2 |
| 64 | +#define BYTES $a3 |
| 65 | + |
| 66 | +/* Output argument */ |
| 67 | +/* NONCE[0] is kept in a register and not in memory. |
| 68 |  + * We don't want to touch the original value in memory. |
| 69 | + * Must be incremented every loop iteration. |
| 70 | + */ |
| 71 | +#define NONCE_0 $v0 |
| 72 | + |
| 73 | +/* SAVED_X and SAVED_CA are set in the jump table. |
| 74 |  + * Use regs which are overwritten on exit so we don't leak clear data. |
| 75 |  + * They are used to handle the last bytes which are not multiple of 4. |
| 76 | + */ |
| 77 | +#define SAVED_X X15 |
| 78 | +#define SAVED_CA $s7 |
| 79 | + |
| 80 | +#define IS_UNALIGNED $s7 |
| 81 | + |
| 82 | +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
| 83 | +#define MSB 0 |
| 84 | +#define LSB 3 |
| 85 | +#define ROTx rotl |
| 86 | +#define ROTR(n) rotr n, 24 |
| 87 | +#define CPU_TO_LE32(n) \ |
| 88 | + wsbh n; \ |
| 89 | + rotr n, 16; |
| 90 | +#else |
| 91 | +#define MSB 3 |
| 92 | +#define LSB 0 |
| 93 | +#define ROTx rotr |
| 94 | +#define CPU_TO_LE32(n) |
| 95 | +#define ROTR(n) |
| 96 | +#endif |
| 97 | + |
| 98 | +#define FOR_EACH_WORD(x) \ |
| 99 | + x( 0); \ |
| 100 | + x( 1); \ |
| 101 | + x( 2); \ |
| 102 | + x( 3); \ |
| 103 | + x( 4); \ |
| 104 | + x( 5); \ |
| 105 | + x( 6); \ |
| 106 | + x( 7); \ |
| 107 | + x( 8); \ |
| 108 | + x( 9); \ |
| 109 | + x(10); \ |
| 110 | + x(11); \ |
| 111 | + x(12); \ |
| 112 | + x(13); \ |
| 113 | + x(14); \ |
| 114 | + x(15); |
| 115 | + |
| 116 | +#define FOR_EACH_WORD_REV(x) \ |
| 117 | + x(15); \ |
| 118 | + x(14); \ |
| 119 | + x(13); \ |
| 120 | + x(12); \ |
| 121 | + x(11); \ |
| 122 | + x(10); \ |
| 123 | + x( 9); \ |
| 124 | + x( 8); \ |
| 125 | + x( 7); \ |
| 126 | + x( 6); \ |
| 127 | + x( 5); \ |
| 128 | + x( 4); \ |
| 129 | + x( 3); \ |
| 130 | + x( 2); \ |
| 131 | + x( 1); \ |
| 132 | + x( 0); |
| 133 | + |
| 134 | +#define PLUS_ONE_0 1 |
| 135 | +#define PLUS_ONE_1 2 |
| 136 | +#define PLUS_ONE_2 3 |
| 137 | +#define PLUS_ONE_3 4 |
| 138 | +#define PLUS_ONE_4 5 |
| 139 | +#define PLUS_ONE_5 6 |
| 140 | +#define PLUS_ONE_6 7 |
| 141 | +#define PLUS_ONE_7 8 |
| 142 | +#define PLUS_ONE_8 9 |
| 143 | +#define PLUS_ONE_9 10 |
| 144 | +#define PLUS_ONE_10 11 |
| 145 | +#define PLUS_ONE_11 12 |
| 146 | +#define PLUS_ONE_12 13 |
| 147 | +#define PLUS_ONE_13 14 |
| 148 | +#define PLUS_ONE_14 15 |
| 149 | +#define PLUS_ONE_15 16 |
| 150 | +#define PLUS_ONE(x) PLUS_ONE_ ## x |
| 151 | +#define _CONCAT3(a,b,c) a ## b ## c |
| 152 | +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) |
| 153 | + |
| 154 | +#define STORE_UNALIGNED(x) \ |
| 155 | +CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ |
| 156 | + .if (x != 12); \ |
| 157 | + lw T0, (x*4)(STATE); \ |
| 158 | + .endif; \ |
| 159 | + lwl T1, (x*4)+MSB ## (IN); \ |
| 160 | + lwr T1, (x*4)+LSB ## (IN); \ |
| 161 | + .if (x == 12); \ |
| 162 | + addu X ## x, NONCE_0; \ |
| 163 | + .else; \ |
| 164 | + addu X ## x, T0; \ |
| 165 | + .endif; \ |
| 166 | + CPU_TO_LE32(X ## x); \ |
| 167 | + xor X ## x, T1; \ |
| 168 | + swl X ## x, (x*4)+MSB ## (OUT); \ |
| 169 | + swr X ## x, (x*4)+LSB ## (OUT); |
| 170 | + |
| 171 | +#define STORE_ALIGNED(x) \ |
| 172 | +CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ |
| 173 | + .if (x != 12); \ |
| 174 | + lw T0, (x*4)(STATE); \ |
| 175 | + .endif; \ |
| 176 | + lw T1, (x*4) ## (IN); \ |
| 177 | + .if (x == 12); \ |
| 178 | + addu X ## x, NONCE_0; \ |
| 179 | + .else; \ |
| 180 | + addu X ## x, T0; \ |
| 181 | + .endif; \ |
| 182 | + CPU_TO_LE32(X ## x); \ |
| 183 | + xor X ## x, T1; \ |
| 184 | + sw X ## x, (x*4) ## (OUT); |
| 185 | + |
| 186 | +/* Jump table macro. |
| 187 | + * Used for setup and handling the last bytes, which are not multiple of 4. |
| 188 | + * X15 is free to store Xn |
| 189 | + * Every jumptable entry must be equal in size. |
| 190 | + */ |
| 191 | +#define JMPTBL_ALIGNED(x) \ |
| 192 | +.Lchacha20_mips_jmptbl_aligned_ ## x: ; \ |
| 193 | + .set noreorder; \ |
| 194 | + b .Lchacha20_mips_xor_aligned_ ## x ## _b; \ |
| 195 | + .if (x == 12); \ |
| 196 | + addu SAVED_X, X ## x, NONCE_0; \ |
| 197 | + .else; \ |
| 198 | + addu SAVED_X, X ## x, SAVED_CA; \ |
| 199 | + .endif; \ |
| 200 | + .set reorder |
| 201 | + |
| 202 | +#define JMPTBL_UNALIGNED(x) \ |
| 203 | +.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \ |
| 204 | + .set noreorder; \ |
| 205 | + b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \ |
| 206 | + .if (x == 12); \ |
| 207 | + addu SAVED_X, X ## x, NONCE_0; \ |
| 208 | + .else; \ |
| 209 | + addu SAVED_X, X ## x, SAVED_CA; \ |
| 210 | + .endif; \ |
| 211 | + .set reorder |
| 212 | + |
| 213 | +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ |
| 214 | + addu X(A), X(K); \ |
| 215 | + addu X(B), X(L); \ |
| 216 | + addu X(C), X(M); \ |
| 217 | + addu X(D), X(N); \ |
| 218 | + xor X(V), X(A); \ |
| 219 | + xor X(W), X(B); \ |
| 220 | + xor X(Y), X(C); \ |
| 221 | + xor X(Z), X(D); \ |
| 222 | + rotl X(V), S; \ |
| 223 | + rotl X(W), S; \ |
| 224 | + rotl X(Y), S; \ |
| 225 | + rotl X(Z), S; |
| 226 | + |
| 227 | +.text |
| 228 | +.set reorder |
| 229 | +.set noat |
| 230 | +.globl chacha20_mips |
| 231 | +.ent chacha20_mips |
| 232 | +chacha20_mips: |
| 233 | + .frame $sp, STACK_SIZE, $ra |
| 234 | + |
| 235 | + addiu $sp, -STACK_SIZE |
| 236 | + |
| 237 | + /* Return bytes = 0. */ |
| 238 | + beqz BYTES, .Lchacha20_mips_end |
| 239 | + |
| 240 | + lw NONCE_0, 48(STATE) |
| 241 | + |
| 242 | + /* Save s0-s7 */ |
| 243 | + sw $s0, 0($sp) |
| 244 | + sw $s1, 4($sp) |
| 245 | + sw $s2, 8($sp) |
| 246 | + sw $s3, 12($sp) |
| 247 | + sw $s4, 16($sp) |
| 248 | + sw $s5, 20($sp) |
| 249 | + sw $s6, 24($sp) |
| 250 | + sw $s7, 28($sp) |
| 251 | + |
| 252 | + /* Test IN or OUT is unaligned. |
| 253 | + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 |
| 254 | + */ |
| 255 | + or IS_UNALIGNED, IN, OUT |
| 256 | + andi IS_UNALIGNED, 0x3 |
| 257 | + |
| 258 | + /* Set number of rounds */ |
| 259 | + li $at, 20 |
| 260 | + |
| 261 | + b .Lchacha20_rounds_start |
| 262 | + |
| 263 | +.align 4 |
| 264 | +.Loop_chacha20_rounds: |
| 265 | + addiu IN, CHACHA20_BLOCK_SIZE |
| 266 | + addiu OUT, CHACHA20_BLOCK_SIZE |
| 267 | + addiu NONCE_0, 1 |
| 268 | + |
| 269 | +.Lchacha20_rounds_start: |
| 270 | + lw X0, 0(STATE) |
| 271 | + lw X1, 4(STATE) |
| 272 | + lw X2, 8(STATE) |
| 273 | + lw X3, 12(STATE) |
| 274 | + |
| 275 | + lw X4, 16(STATE) |
| 276 | + lw X5, 20(STATE) |
| 277 | + lw X6, 24(STATE) |
| 278 | + lw X7, 28(STATE) |
| 279 | + lw X8, 32(STATE) |
| 280 | + lw X9, 36(STATE) |
| 281 | + lw X10, 40(STATE) |
| 282 | + lw X11, 44(STATE) |
| 283 | + |
| 284 | + move X12, NONCE_0 |
| 285 | + lw X13, 52(STATE) |
| 286 | + lw X14, 56(STATE) |
| 287 | + lw X15, 60(STATE) |
| 288 | + |
| 289 | +.Loop_chacha20_xor_rounds: |
| 290 | + addiu $at, -2 |
| 291 | + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); |
| 292 | + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); |
| 293 | + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); |
| 294 | + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); |
| 295 | + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); |
| 296 | + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); |
| 297 | + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); |
| 298 | + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); |
| 299 | + bnez $at, .Loop_chacha20_xor_rounds |
| 300 | + |
| 301 | + addiu BYTES, -(CHACHA20_BLOCK_SIZE) |
| 302 | + |
| 303 | + /* Is data src/dst unaligned? Jump */ |
| 304 | + bnez IS_UNALIGNED, .Loop_chacha20_unaligned |
| 305 | + |
| 306 |  +	/* Set number of rounds here to fill delayslot. */ |
| 307 | + li $at, 20 |
| 308 | + |
| 309 | + /* BYTES < 0, it has no full block. */ |
| 310 | + bltz BYTES, .Lchacha20_mips_no_full_block_aligned |
| 311 | + |
| 312 | + FOR_EACH_WORD_REV(STORE_ALIGNED) |
| 313 | + |
| 314 | + /* BYTES > 0? Loop again. */ |
| 315 | + bgtz BYTES, .Loop_chacha20_rounds |
| 316 | + |
| 317 | + /* Place this here to fill delay slot */ |
| 318 | + addiu NONCE_0, 1 |
| 319 | + |
| 320 | + /* BYTES < 0? Handle last bytes */ |
| 321 | + bltz BYTES, .Lchacha20_mips_xor_bytes |
| 322 | + |
| 323 | +.Lchacha20_mips_xor_done: |
| 324 | + /* Restore used registers */ |
| 325 | + lw $s0, 0($sp) |
| 326 | + lw $s1, 4($sp) |
| 327 | + lw $s2, 8($sp) |
| 328 | + lw $s3, 12($sp) |
| 329 | + lw $s4, 16($sp) |
| 330 | + lw $s5, 20($sp) |
| 331 | + lw $s6, 24($sp) |
| 332 | + lw $s7, 28($sp) |
| 333 | + |
| 334 | + /* Write NONCE_0 back to right location in state */ |
| 335 | + sw NONCE_0, 48(STATE) |
| 336 | + |
| 337 | +.Lchacha20_mips_end: |
| 338 | + addiu $sp, STACK_SIZE |
| 339 | + jr $ra |
| 340 | + |
| 341 | +.Lchacha20_mips_no_full_block_aligned: |
| 342 | + /* Restore the offset on BYTES */ |
| 343 | + addiu BYTES, CHACHA20_BLOCK_SIZE |
| 344 | + |
| 345 | + /* Get number of full WORDS */ |
| 346 | + andi $at, BYTES, MASK_U32 |
| 347 | + |
| 348 | + /* Load upper half of jump table addr */ |
| 349 | + lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0) |
| 350 | + |
| 351 | + /* Calculate lower half jump table offset */ |
| 352 | + ins T0, $at, 1, 6 |
| 353 | + |
| 354 | + /* Add offset to STATE */ |
| 355 | + addu T1, STATE, $at |
| 356 | + |
| 357 | + /* Add lower half jump table addr */ |
| 358 | + addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0) |
| 359 | + |
| 360 | + /* Read value from STATE */ |
| 361 | + lw SAVED_CA, 0(T1) |
| 362 | + |
| 363 | + /* Store remaining bytecounter as negative value */ |
| 364 | + subu BYTES, $at, BYTES |
| 365 | + |
| 366 | + jr T0 |
| 367 | + |
| 368 | + /* Jump table */ |
| 369 | + FOR_EACH_WORD(JMPTBL_ALIGNED) |
| 370 | + |
| 371 | + |
| 372 | +.Loop_chacha20_unaligned: |
| 373 |  +	/* Set number of rounds here to fill delayslot. */ |
| 374 | + li $at, 20 |
| 375 | + |
| 376 |  +	/* BYTES < 0, it has no full block. */ |
| 377 | + bltz BYTES, .Lchacha20_mips_no_full_block_unaligned |
| 378 | + |
| 379 | + FOR_EACH_WORD_REV(STORE_UNALIGNED) |
| 380 | + |
| 381 | + /* BYTES > 0? Loop again. */ |
| 382 | + bgtz BYTES, .Loop_chacha20_rounds |
| 383 | + |
| 384 | + /* Write NONCE_0 back to right location in state */ |
| 385 | + sw NONCE_0, 48(STATE) |
| 386 | + |
| 387 | + .set noreorder |
| 388 | + /* Fall through to byte handling */ |
| 389 | + bgez BYTES, .Lchacha20_mips_xor_done |
| 390 | +.Lchacha20_mips_xor_unaligned_0_b: |
| 391 | +.Lchacha20_mips_xor_aligned_0_b: |
| 392 | + /* Place this here to fill delay slot */ |
| 393 | + addiu NONCE_0, 1 |
| 394 | + .set reorder |
| 395 | + |
| 396 | +.Lchacha20_mips_xor_bytes: |
| 397 | + addu IN, $at |
| 398 | + addu OUT, $at |
| 399 | + /* First byte */ |
| 400 | + lbu T1, 0(IN) |
| 401 | + addiu $at, BYTES, 1 |
| 402 | + CPU_TO_LE32(SAVED_X) |
| 403 | + ROTR(SAVED_X) |
| 404 | + xor T1, SAVED_X |
| 405 | + sb T1, 0(OUT) |
| 406 | + beqz $at, .Lchacha20_mips_xor_done |
| 407 | + /* Second byte */ |
| 408 | + lbu T1, 1(IN) |
| 409 | + addiu $at, BYTES, 2 |
| 410 | + ROTx SAVED_X, 8 |
| 411 | + xor T1, SAVED_X |
| 412 | + sb T1, 1(OUT) |
| 413 | + beqz $at, .Lchacha20_mips_xor_done |
| 414 | + /* Third byte */ |
| 415 | + lbu T1, 2(IN) |
| 416 | + ROTx SAVED_X, 8 |
| 417 | + xor T1, SAVED_X |
| 418 | + sb T1, 2(OUT) |
| 419 | + b .Lchacha20_mips_xor_done |
| 420 | + |
| 421 | +.Lchacha20_mips_no_full_block_unaligned: |
| 422 | + /* Restore the offset on BYTES */ |
| 423 | + addiu BYTES, CHACHA20_BLOCK_SIZE |
| 424 | + |
| 425 | + /* Get number of full WORDS */ |
| 426 | + andi $at, BYTES, MASK_U32 |
| 427 | + |
| 428 | + /* Load upper half of jump table addr */ |
| 429 | + lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0) |
| 430 | + |
| 431 | + /* Calculate lower half jump table offset */ |
| 432 | + ins T0, $at, 1, 6 |
| 433 | + |
| 434 | + /* Add offset to STATE */ |
| 435 | + addu T1, STATE, $at |
| 436 | + |
| 437 | + /* Add lower half jump table addr */ |
| 438 | + addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0) |
| 439 | + |
| 440 | + /* Read value from STATE */ |
| 441 | + lw SAVED_CA, 0(T1) |
| 442 | + |
| 443 | + /* Store remaining bytecounter as negative value */ |
| 444 | + subu BYTES, $at, BYTES |
| 445 | + |
| 446 | + jr T0 |
| 447 | + |
| 448 | + /* Jump table */ |
| 449 | + FOR_EACH_WORD(JMPTBL_UNALIGNED) |
| 450 | +.end chacha20_mips |
| 451 | +.set at |