target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch - T108 - Gitiles

 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
 Date: Fri, 8 Nov 2019 13:22:16 +0100
 Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit

 commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.

 This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
 Zinc patch set.

 Co-developed-by: René van Dorst <opensource@vdorst.com>
 Signed-off-by: René van Dorst <opensource@vdorst.com>
 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
 Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
 ---
  arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
  1 file changed, 424 insertions(+)
  create mode 100644 arch/mips/crypto/chacha-core.S

 --- /dev/null
 +++ b/arch/mips/crypto/chacha-core.S
 @@ -0,0 +1,424 @@
 +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
 +/*
 + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 + */
 +
 +#define MASK_U32		0x3c
 +#define CHACHA20_BLOCK_SIZE	64
 +#define STACK_SIZE		32
 +
 +#define X0	$t0
 +#define X1	$t1
 +#define X2	$t2
 +#define X3	$t3
 +#define X4	$t4
 +#define X5	$t5
 +#define X6	$t6
 +#define X7	$t7
 +#define X8	$t8
 +#define X9	$t9
 +#define X10	$v1
 +#define X11	$s6
 +#define X12	$s5
 +#define X13	$s4
 +#define X14	$s3
 +#define X15	$s2
 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
 +#define T0	$s1
 +#define T1	$s0
 +#define T(n)	T ## n
 +#define X(n)	X ## n
 +
 +/* Input arguments */
 +#define STATE		$a0
 +#define OUT		$a1
 +#define IN		$a2
 +#define BYTES		$a3
 +
 +/* Output argument */
 +/* NONCE[0] is kept in a register and not in memory.
 + * We don't want to touch original value in memory.
 + * Must be incremented every loop iteration.
 + */
 +#define NONCE_0		$v0
 +
 +/* SAVED_X and SAVED_CA are set in the jump table.
 + * Use regs which are overwritten on exit else we don't leak clear data.
 + * They are used to handling the last bytes which are not multiple of 4.
 + */
 +#define SAVED_X		X15
 +#define SAVED_CA	$s7
 +
 +#define IS_UNALIGNED	$s7
 +
 +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 +#define MSB 0
 +#define LSB 3
 +#define ROTx rotl
 +#define ROTR(n) rotr n, 24
 +#define	CPU_TO_LE32(n) \
 +	wsbh	n; \
 +	rotr	n, 16;
 +#else
 +#define MSB 3
 +#define LSB 0
 +#define ROTx rotr
 +#define CPU_TO_LE32(n)
 +#define ROTR(n)
 +#endif
 +
 +#define FOR_EACH_WORD(x) \
 +	x( 0); \
 +	x( 1); \
 +	x( 2); \
 +	x( 3); \
 +	x( 4); \
 +	x( 5); \
 +	x( 6); \
 +	x( 7); \
 +	x( 8); \
 +	x( 9); \
 +	x(10); \
 +	x(11); \
 +	x(12); \
 +	x(13); \
 +	x(14); \
 +	x(15);
 +
 +#define FOR_EACH_WORD_REV(x) \
 +	x(15); \
 +	x(14); \
 +	x(13); \
 +	x(12); \
 +	x(11); \
 +	x(10); \
 +	x( 9); \
 +	x( 8); \
 +	x( 7); \
 +	x( 6); \
 +	x( 5); \
 +	x( 4); \
 +	x( 3); \
 +	x( 2); \
 +	x( 1); \
 +	x( 0);
 +
 +#define PLUS_ONE_0	 1
 +#define PLUS_ONE_1	 2
 +#define PLUS_ONE_2	 3
 +#define PLUS_ONE_3	 4
 +#define PLUS_ONE_4	 5
 +#define PLUS_ONE_5	 6
 +#define PLUS_ONE_6	 7
 +#define PLUS_ONE_7	 8
 +#define PLUS_ONE_8	 9
 +#define PLUS_ONE_9	10
 +#define PLUS_ONE_10	11
 +#define PLUS_ONE_11	12
 +#define PLUS_ONE_12	13
 +#define PLUS_ONE_13	14
 +#define PLUS_ONE_14	15
 +#define PLUS_ONE_15	16
 +#define PLUS_ONE(x)	PLUS_ONE_ ## x
 +#define _CONCAT3(a,b,c)	a ## b ## c
 +#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
 +
 +#define STORE_UNALIGNED(x) \
 +CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
 +	.if (x != 12); \
 +		lw	T0, (x*4)(STATE); \
 +	.endif; \
 +	lwl	T1, (x*4)+MSB ## (IN); \
 +	lwr	T1, (x*4)+LSB ## (IN); \
 +	.if (x == 12); \
 +		addu	X ## x, NONCE_0; \
 +	.else; \
 +		addu	X ## x, T0; \
 +	.endif; \
 +	CPU_TO_LE32(X ## x); \
 +	xor	X ## x, T1; \
 +	swl	X ## x, (x*4)+MSB ## (OUT); \
 +	swr	X ## x, (x*4)+LSB ## (OUT);
 +
 +#define STORE_ALIGNED(x) \
 +CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
 +	.if (x != 12); \
 +		lw	T0, (x*4)(STATE); \
 +	.endif; \
 +	lw	T1, (x*4) ## (IN); \
 +	.if (x == 12); \
 +		addu	X ## x, NONCE_0; \
 +	.else; \
 +		addu	X ## x, T0; \
 +	.endif; \
 +	CPU_TO_LE32(X ## x); \
 +	xor	X ## x, T1; \
 +	sw	X ## x, (x*4) ## (OUT);
 +
 +/* Jump table macro.
 + * Used for setup and handling the last bytes, which are not multiple of 4.
 + * X15 is free to store Xn
 + * Every jumptable entry must be equal in size.
 + */
 +#define JMPTBL_ALIGNED(x) \
 +.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
 +	.set	noreorder; \
 +	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
 +	.if (x == 12); \
 +		addu	SAVED_X, X ## x, NONCE_0; \
 +	.else; \
 +		addu	SAVED_X, X ## x, SAVED_CA; \
 +	.endif; \
 +	.set	reorder
 +
 +#define JMPTBL_UNALIGNED(x) \
 +.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
 +	.set	noreorder; \
 +	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
 +	.if (x == 12); \
 +		addu	SAVED_X, X ## x, NONCE_0; \
 +	.else; \
 +		addu	SAVED_X, X ## x, SAVED_CA; \
 +	.endif; \
 +	.set	reorder
 +
 +#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
 +	addu	X(A), X(K); \
 +	addu	X(B), X(L); \
 +	addu	X(C), X(M); \
 +	addu	X(D), X(N); \
 +	xor	X(V), X(A); \
 +	xor	X(W), X(B); \
 +	xor	X(Y), X(C); \
 +	xor	X(Z), X(D); \
 +	rotl	X(V), S;    \
 +	rotl	X(W), S;    \
 +	rotl	X(Y), S;    \
 +	rotl	X(Z), S;
 +
 +.text
 +.set	reorder
 +.set	noat
 +.globl	chacha20_mips
 +.ent	chacha20_mips
 +chacha20_mips:
 +	.frame	$sp, STACK_SIZE, $ra
 +
 +	addiu	$sp, -STACK_SIZE
 +
 +	/* Return bytes = 0. */
 +	beqz	BYTES, .Lchacha20_mips_end
 +
 +	lw	NONCE_0, 48(STATE)
 +
 +	/* Save s0-s7 */
 +	sw	$s0,  0($sp)
 +	sw	$s1,  4($sp)
 +	sw	$s2,  8($sp)
 +	sw	$s3, 12($sp)
 +	sw	$s4, 16($sp)
 +	sw	$s5, 20($sp)
 +	sw	$s6, 24($sp)
 +	sw	$s7, 28($sp)
 +
 +	/* Test IN or OUT is unaligned.
 +	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
 +	 */
 +	or	IS_UNALIGNED, IN, OUT
 +	andi	IS_UNALIGNED, 0x3
 +
 +	/* Set number of rounds */
 +	li	$at, 20
 +
 +	b	.Lchacha20_rounds_start
 +
 +.align 4
 +.Loop_chacha20_rounds:
 +	addiu	IN,  CHACHA20_BLOCK_SIZE
 +	addiu	OUT, CHACHA20_BLOCK_SIZE
 +	addiu	NONCE_0, 1
 +
 +.Lchacha20_rounds_start:
 +	lw	X0,  0(STATE)
 +	lw	X1,  4(STATE)
 +	lw	X2,  8(STATE)
 +	lw	X3,  12(STATE)
 +
 +	lw	X4,  16(STATE)
 +	lw	X5,  20(STATE)
 +	lw	X6,  24(STATE)
 +	lw	X7,  28(STATE)
 +	lw	X8,  32(STATE)
 +	lw	X9,  36(STATE)
 +	lw	X10, 40(STATE)
 +	lw	X11, 44(STATE)
 +
 +	move	X12, NONCE_0
 +	lw	X13, 52(STATE)
 +	lw	X14, 56(STATE)
 +	lw	X15, 60(STATE)
 +
 +.Loop_chacha20_xor_rounds:
 +	addiu	$at, -2
 +	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
 +	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
 +	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
 +	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
 +	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
 +	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
 +	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
 +	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
 +	bnez	$at, .Loop_chacha20_xor_rounds
 +
 +	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
 +
 +	/* Is data src/dst unaligned? Jump */
 +	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned
 +
 +	/* Set number rounds here to fill delayslot. */
 +	li	$at, 20
 +
 +	/* BYTES < 0, it has no full block. */
 +	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned
 +
 +	FOR_EACH_WORD_REV(STORE_ALIGNED)
 +
 +	/* BYTES > 0? Loop again. */
 +	bgtz	BYTES, .Loop_chacha20_rounds
 +
 +	/* Place this here to fill delay slot */
 +	addiu	NONCE_0, 1
 +
 +	/* BYTES < 0? Handle last bytes */
 +	bltz	BYTES, .Lchacha20_mips_xor_bytes
 +
 +.Lchacha20_mips_xor_done:
 +	/* Restore used registers */
 +	lw	$s0,  0($sp)
 +	lw	$s1,  4($sp)
 +	lw	$s2,  8($sp)
 +	lw	$s3, 12($sp)
 +	lw	$s4, 16($sp)
 +	lw	$s5, 20($sp)
 +	lw	$s6, 24($sp)
 +	lw	$s7, 28($sp)
 +
 +	/* Write NONCE_0 back to right location in state */
 +	sw	NONCE_0, 48(STATE)
 +
 +.Lchacha20_mips_end:
 +	addiu	$sp, STACK_SIZE
 +	jr	$ra
 +
 +.Lchacha20_mips_no_full_block_aligned:
 +	/* Restore the offset on BYTES */
 +	addiu	BYTES, CHACHA20_BLOCK_SIZE
 +
 +	/* Get number of full WORDS */
 +	andi	$at, BYTES, MASK_U32
 +
 +	/* Load upper half of jump table addr */
 +	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
 +
 +	/* Calculate lower half jump table offset */
 +	ins	T0, $at, 1, 6
 +
 +	/* Add offset to STATE */
 +	addu	T1, STATE, $at
 +
 +	/* Add lower half jump table addr */
 +	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
 +
 +	/* Read value from STATE */
 +	lw	SAVED_CA, 0(T1)
 +
 +	/* Store remaining bytecounter as negative value */
 +	subu	BYTES, $at, BYTES
 +
 +	jr	T0
 +
 +	/* Jump table */
 +	FOR_EACH_WORD(JMPTBL_ALIGNED)
 +
 +
 +.Loop_chacha20_unaligned:
 +	/* Set number rounds here to fill delayslot. */
 +	li	$at, 20
 +
 +	/* BYTES > 0, it has no full block. */
 +	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned
 +
 +	FOR_EACH_WORD_REV(STORE_UNALIGNED)
 +
 +	/* BYTES > 0? Loop again. */
 +	bgtz	BYTES, .Loop_chacha20_rounds
 +
 +	/* Write NONCE_0 back to right location in state */
 +	sw	NONCE_0, 48(STATE)
 +
 +	.set noreorder
 +	/* Fall through to byte handling */
 +	bgez	BYTES, .Lchacha20_mips_xor_done
 +.Lchacha20_mips_xor_unaligned_0_b:
 +.Lchacha20_mips_xor_aligned_0_b:
 +	/* Place this here to fill delay slot */
 +	addiu	NONCE_0, 1
 +	.set reorder
 +
 +.Lchacha20_mips_xor_bytes:
 +	addu	IN, $at
 +	addu	OUT, $at
 +	/* First byte */
 +	lbu	T1, 0(IN)
 +	addiu	$at, BYTES, 1
 +	CPU_TO_LE32(SAVED_X)
 +	ROTR(SAVED_X)
 +	xor	T1, SAVED_X
 +	sb	T1, 0(OUT)
 +	beqz	$at, .Lchacha20_mips_xor_done
 +	/* Second byte */
 +	lbu	T1, 1(IN)
 +	addiu	$at, BYTES, 2
 +	ROTx	SAVED_X, 8
 +	xor	T1, SAVED_X
 +	sb	T1, 1(OUT)
 +	beqz	$at, .Lchacha20_mips_xor_done
 +	/* Third byte */
 +	lbu	T1, 2(IN)
 +	ROTx	SAVED_X, 8
 +	xor	T1, SAVED_X
 +	sb	T1, 2(OUT)
 +	b	.Lchacha20_mips_xor_done
 +
 +.Lchacha20_mips_no_full_block_unaligned:
 +	/* Restore the offset on BYTES */
 +	addiu	BYTES, CHACHA20_BLOCK_SIZE
 +
 +	/* Get number of full WORDS */
 +	andi	$at, BYTES, MASK_U32
 +
 +	/* Load upper half of jump table addr */
 +	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
 +
 +	/* Calculate lower half jump table offset */
 +	ins	T0, $at, 1, 6
 +
 +	/* Add offset to STATE */
 +	addu	T1, STATE, $at
 +
 +	/* Add lower half jump table addr */
 +	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
 +
 +	/* Read value from STATE */
 +	lw	SAVED_CA, 0(T1)
 +
 +	/* Store remaining bytecounter as negative value */
 +	subu	BYTES, $at, BYTES
 +
 +	jr	T0
 +
 +	/* Jump table */
 +	FOR_EACH_WORD(JMPTBL_UNALIGNED)
 +.end chacha20_mips
 +.set at
	From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
	From: "Jason A. Donenfeld" <Jason@zx2c4.com>
	Date: Fri, 8 Nov 2019 13:22:16 +0100
	Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
	MIME-Version: 1.0
	Content-Type: text/plain; charset=UTF-8
	Content-Transfer-Encoding: 8bit

	commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.

	This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
	Zinc patch set.

	Co-developed-by: René van Dorst <opensource@vdorst.com>
	Signed-off-by: René van Dorst <opensource@vdorst.com>
	Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
	Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
	Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
	Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
	---
	arch/mips/crypto/chacha-core.S \| 424 +++++++++++++++++++++++++++++++++
	1 file changed, 424 insertions(+)
	create mode 100644 arch/mips/crypto/chacha-core.S

	--- /dev/null
	+++ b/arch/mips/crypto/chacha-core.S
	@@ -0,0 +1,424 @@
	+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
	+/*
	+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
	+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
	+ */
	+
	+#define MASK_U32 0x3c
	+#define CHACHA20_BLOCK_SIZE 64
	+#define STACK_SIZE 32
	+
	+#define X0 $t0
	+#define X1 $t1
	+#define X2 $t2
	+#define X3 $t3
	+#define X4 $t4
	+#define X5 $t5
	+#define X6 $t6
	+#define X7 $t7
	+#define X8 $t8
	+#define X9 $t9
	+#define X10 $v1
	+#define X11 $s6
	+#define X12 $s5
	+#define X13 $s4
	+#define X14 $s3
	+#define X15 $s2
	+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
	+#define T0 $s1
	+#define T1 $s0
	+#define T(n) T ## n
	+#define X(n) X ## n
	+
	+/* Input arguments */
	+#define STATE $a0
	+#define OUT $a1
	+#define IN $a2
	+#define BYTES $a3
	+
	+/* Output argument */
	+/* NONCE[0] is kept in a register and not in memory.
	+ * We don't want to touch original value in memory.
	+ * Must be incremented every loop iteration.
	+ */
	+#define NONCE_0 $v0
	+
	+/* SAVED_X and SAVED_CA are set in the jump table.
	+ * Use regs which are overwritten on exit else we don't leak clear data.
	+ * They are used to handling the last bytes which are not multiple of 4.
	+ */
	+#define SAVED_X X15
	+#define SAVED_CA $s7
	+
	+#define IS_UNALIGNED $s7
	+
	+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	+#define MSB 0
	+#define LSB 3
	+#define ROTx rotl
	+#define ROTR(n) rotr n, 24
	+#define CPU_TO_LE32(n) \
	+ wsbh n; \
	+ rotr n, 16;
	+#else
	+#define MSB 3
	+#define LSB 0
	+#define ROTx rotr
	+#define CPU_TO_LE32(n)
	+#define ROTR(n)
	+#endif
	+
	+#define FOR_EACH_WORD(x) \
	+ x( 0); \
	+ x( 1); \
	+ x( 2); \
	+ x( 3); \
	+ x( 4); \
	+ x( 5); \
	+ x( 6); \
	+ x( 7); \
	+ x( 8); \
	+ x( 9); \
	+ x(10); \
	+ x(11); \
	+ x(12); \
	+ x(13); \
	+ x(14); \
	+ x(15);
	+
	+#define FOR_EACH_WORD_REV(x) \
	+ x(15); \
	+ x(14); \
	+ x(13); \
	+ x(12); \
	+ x(11); \
	+ x(10); \
	+ x( 9); \
	+ x( 8); \
	+ x( 7); \
	+ x( 6); \
	+ x( 5); \
	+ x( 4); \
	+ x( 3); \
	+ x( 2); \
	+ x( 1); \
	+ x( 0);
	+
	+#define PLUS_ONE_0 1
	+#define PLUS_ONE_1 2
	+#define PLUS_ONE_2 3
	+#define PLUS_ONE_3 4
	+#define PLUS_ONE_4 5
	+#define PLUS_ONE_5 6
	+#define PLUS_ONE_6 7
	+#define PLUS_ONE_7 8
	+#define PLUS_ONE_8 9
	+#define PLUS_ONE_9 10
	+#define PLUS_ONE_10 11
	+#define PLUS_ONE_11 12
	+#define PLUS_ONE_12 13
	+#define PLUS_ONE_13 14
	+#define PLUS_ONE_14 15
	+#define PLUS_ONE_15 16
	+#define PLUS_ONE(x) PLUS_ONE_ ## x
	+#define _CONCAT3(a,b,c) a ## b ## c
	+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
	+
	+#define STORE_UNALIGNED(x) \
	+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	+ .if (x != 12); \
	+ lw T0, (x*4)(STATE); \
	+ .endif; \
	+ lwl T1, (x*4)+MSB ## (IN); \
	+ lwr T1, (x*4)+LSB ## (IN); \
	+ .if (x == 12); \
	+ addu X ## x, NONCE_0; \
	+ .else; \
	+ addu X ## x, T0; \
	+ .endif; \
	+ CPU_TO_LE32(X ## x); \
	+ xor X ## x, T1; \
	+ swl X ## x, (x*4)+MSB ## (OUT); \
	+ swr X ## x, (x*4)+LSB ## (OUT);
	+
	+#define STORE_ALIGNED(x) \
	+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	+ .if (x != 12); \
	+ lw T0, (x*4)(STATE); \
	+ .endif; \
	+ lw T1, (x*4) ## (IN); \
	+ .if (x == 12); \
	+ addu X ## x, NONCE_0; \
	+ .else; \
	+ addu X ## x, T0; \
	+ .endif; \
	+ CPU_TO_LE32(X ## x); \
	+ xor X ## x, T1; \
	+ sw X ## x, (x*4) ## (OUT);
	+
	+/* Jump table macro.
	+ * Used for setup and handling the last bytes, which are not multiple of 4.
	+ * X15 is free to store Xn
	+ * Every jumptable entry must be equal in size.
	+ */
	+#define JMPTBL_ALIGNED(x) \
	+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
	+ .set noreorder; \
	+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
	+ .if (x == 12); \
	+ addu SAVED_X, X ## x, NONCE_0; \
	+ .else; \
	+ addu SAVED_X, X ## x, SAVED_CA; \
	+ .endif; \
	+ .set reorder
	+
	+#define JMPTBL_UNALIGNED(x) \
	+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
	+ .set noreorder; \
	+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
	+ .if (x == 12); \
	+ addu SAVED_X, X ## x, NONCE_0; \
	+ .else; \
	+ addu SAVED_X, X ## x, SAVED_CA; \
	+ .endif; \
	+ .set reorder
	+
	+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
	+ addu X(A), X(K); \
	+ addu X(B), X(L); \
	+ addu X(C), X(M); \
	+ addu X(D), X(N); \
	+ xor X(V), X(A); \
	+ xor X(W), X(B); \
	+ xor X(Y), X(C); \
	+ xor X(Z), X(D); \
	+ rotl X(V), S; \
	+ rotl X(W), S; \
	+ rotl X(Y), S; \
	+ rotl X(Z), S;
	+
	+.text
	+.set reorder
	+.set noat
	+.globl chacha20_mips
	+.ent chacha20_mips
	+chacha20_mips:
	+ .frame $sp, STACK_SIZE, $ra
	+
	+ addiu $sp, -STACK_SIZE
	+
	+ /* Return bytes = 0. */
	+ beqz BYTES, .Lchacha20_mips_end
	+
	+ lw NONCE_0, 48(STATE)
	+
	+ /* Save s0-s7 */
	+ sw $s0, 0($sp)
	+ sw $s1, 4($sp)
	+ sw $s2, 8($sp)
	+ sw $s3, 12($sp)
	+ sw $s4, 16($sp)
	+ sw $s5, 20($sp)
	+ sw $s6, 24($sp)
	+ sw $s7, 28($sp)
	+
	+ /* Test IN or OUT is unaligned.
	+ * IS_UNALIGNED = ( IN \| OUT ) & 0x00000003
	+ */
	+ or IS_UNALIGNED, IN, OUT
	+ andi IS_UNALIGNED, 0x3
	+
	+ /* Set number of rounds */
	+ li $at, 20
	+
	+ b .Lchacha20_rounds_start
	+
	+.align 4
	+.Loop_chacha20_rounds:
	+ addiu IN, CHACHA20_BLOCK_SIZE
	+ addiu OUT, CHACHA20_BLOCK_SIZE
	+ addiu NONCE_0, 1
	+
	+.Lchacha20_rounds_start:
	+ lw X0, 0(STATE)
	+ lw X1, 4(STATE)
	+ lw X2, 8(STATE)
	+ lw X3, 12(STATE)
	+
	+ lw X4, 16(STATE)
	+ lw X5, 20(STATE)
	+ lw X6, 24(STATE)
	+ lw X7, 28(STATE)
	+ lw X8, 32(STATE)
	+ lw X9, 36(STATE)
	+ lw X10, 40(STATE)
	+ lw X11, 44(STATE)
	+
	+ move X12, NONCE_0
	+ lw X13, 52(STATE)
	+ lw X14, 56(STATE)
	+ lw X15, 60(STATE)
	+
	+.Loop_chacha20_xor_rounds:
	+ addiu $at, -2
	+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	+ bnez $at, .Loop_chacha20_xor_rounds
	+
	+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
	+
	+ /* Is data src/dst unaligned? Jump */
	+ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
	+
	+ /* Set number rounds here to fill delayslot. */
	+ li $at, 20
	+
	+ /* BYTES < 0, it has no full block. */
	+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
	+
	+ FOR_EACH_WORD_REV(STORE_ALIGNED)
	+
	+ /* BYTES > 0? Loop again. */
	+ bgtz BYTES, .Loop_chacha20_rounds
	+
	+ /* Place this here to fill delay slot */
	+ addiu NONCE_0, 1
	+
	+ /* BYTES < 0? Handle last bytes */
	+ bltz BYTES, .Lchacha20_mips_xor_bytes
	+
	+.Lchacha20_mips_xor_done:
	+ /* Restore used registers */
	+ lw $s0, 0($sp)
	+ lw $s1, 4($sp)
	+ lw $s2, 8($sp)
	+ lw $s3, 12($sp)
	+ lw $s4, 16($sp)
	+ lw $s5, 20($sp)
	+ lw $s6, 24($sp)
	+ lw $s7, 28($sp)
	+
	+ /* Write NONCE_0 back to right location in state */
	+ sw NONCE_0, 48(STATE)
	+
	+.Lchacha20_mips_end:
	+ addiu $sp, STACK_SIZE
	+ jr $ra
	+
	+.Lchacha20_mips_no_full_block_aligned:
	+ /* Restore the offset on BYTES */
	+ addiu BYTES, CHACHA20_BLOCK_SIZE
	+
	+ /* Get number of full WORDS */
	+ andi $at, BYTES, MASK_U32
	+
	+ /* Load upper half of jump table addr */
	+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
	+
	+ /* Calculate lower half jump table offset */
	+ ins T0, $at, 1, 6
	+
	+ /* Add offset to STATE */
	+ addu T1, STATE, $at
	+
	+ /* Add lower half jump table addr */
	+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
	+
	+ /* Read value from STATE */
	+ lw SAVED_CA, 0(T1)
	+
	+ /* Store remaining bytecounter as negative value */
	+ subu BYTES, $at, BYTES
	+
	+ jr T0
	+
	+ /* Jump table */
	+ FOR_EACH_WORD(JMPTBL_ALIGNED)
	+
	+
	+.Loop_chacha20_unaligned:
	+ /* Set number rounds here to fill delayslot. */
	+ li $at, 20
	+
	+ /* BYTES > 0, it has no full block. */
	+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
	+
	+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
	+
	+ /* BYTES > 0? Loop again. */
	+ bgtz BYTES, .Loop_chacha20_rounds
	+
	+ /* Write NONCE_0 back to right location in state */
	+ sw NONCE_0, 48(STATE)
	+
	+ .set noreorder
	+ /* Fall through to byte handling */
	+ bgez BYTES, .Lchacha20_mips_xor_done
	+.Lchacha20_mips_xor_unaligned_0_b:
	+.Lchacha20_mips_xor_aligned_0_b:
	+ /* Place this here to fill delay slot */
	+ addiu NONCE_0, 1
	+ .set reorder
	+
	+.Lchacha20_mips_xor_bytes:
	+ addu IN, $at
	+ addu OUT, $at
	+ /* First byte */
	+ lbu T1, 0(IN)
	+ addiu $at, BYTES, 1
	+ CPU_TO_LE32(SAVED_X)
	+ ROTR(SAVED_X)
	+ xor T1, SAVED_X
	+ sb T1, 0(OUT)
	+ beqz $at, .Lchacha20_mips_xor_done
	+ /* Second byte */
	+ lbu T1, 1(IN)
	+ addiu $at, BYTES, 2
	+ ROTx SAVED_X, 8
	+ xor T1, SAVED_X
	+ sb T1, 1(OUT)
	+ beqz $at, .Lchacha20_mips_xor_done
	+ /* Third byte */
	+ lbu T1, 2(IN)
	+ ROTx SAVED_X, 8
	+ xor T1, SAVED_X
	+ sb T1, 2(OUT)
	+ b .Lchacha20_mips_xor_done
	+
	+.Lchacha20_mips_no_full_block_unaligned:
	+ /* Restore the offset on BYTES */
	+ addiu BYTES, CHACHA20_BLOCK_SIZE
	+
	+ /* Get number of full WORDS */
	+ andi $at, BYTES, MASK_U32
	+
	+ /* Load upper half of jump table addr */
	+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
	+
	+ /* Calculate lower half jump table offset */
	+ ins T0, $at, 1, 6
	+
	+ /* Add offset to STATE */
	+ addu T1, STATE, $at
	+
	+ /* Add lower half jump table addr */
	+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
	+
	+ /* Read value from STATE */
	+ lw SAVED_CA, 0(T1)
	+
	+ /* Store remaining bytecounter as negative value */
	+ subu BYTES, $at, BYTES
	+
	+ jr T0
	+
	+ /* Jump table */
	+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
	+.end chacha20_mips
	+.set at