From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 8 Nov 2019 13:22:38 +0100
Subject: [PATCH] crypto: arm/curve25519 - wire up NEON implementation

commit d8f1308a025fc7e00414194ed742d5f05a21e13c upstream.

This ports the SUPERCOP implementation for usage in kernel space. In
addition to the usual header, macro, and style changes required for
kernel space, it makes a few small changes to the code:

- The stack alignment is relaxed to 16 bytes.
- Superfluous mov statements have been removed.
- ldr for constants has been replaced with movw.
- ldreq has been replaced with moveq.
- The str epilogue has been made more idiomatic.
- SIMD registers are not pushed and popped at the beginning and end.
- The prologue and epilogue have been made idiomatic.
- A hole has been removed from the stack, saving 32 bytes.
- We write-back the base register whenever possible for vld1.8.
- Some multiplications have been reordered for better A7 performance.
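
As an illustration of the constant-handling changes (third and fourth
bullets), compare these two pairs, excerpted from the diff below:

	Before:				After:
		ldr r5, =254			movw r5, #254
		ldreq r5, =1			moveq r5, #1

The ldr rN, =imm pseudo-instruction leaves it to the assembler to pick
between a mov and a literal-pool load; movw and moveq encode the
constant directly in a single instruction and never touch memory.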

There are more opportunities for cleanup, since this code is from qhasm,
which doesn't always do the most opportune thing. But even prior to
extensive hand optimizations, this code delivers significant performance
improvements (given in get_cycles() per call):

             | generic C | this commit |
 ------------ ----------- -------------
| Cortex-A7  |     49136 |       22395 |
 ------------ ----------- -------------
| Cortex-A17 |     17326 |        4983 |
 ------------ ----------- -------------
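
(Figures of this kind are typically collected with a small in-kernel
harness along the lines of the sketch below. It is illustrative only,
not part of this patch, and bench_curve25519 is a made-up name.)

	#include <crypto/curve25519.h>
	#include <linux/math64.h>
	#include <linux/random.h>
	#include <linux/timex.h>

	/* Average get_cycles() ticks for one scalar multiplication. */
	static u64 bench_curve25519(unsigned int iters)
	{
		u8 out[CURVE25519_KEY_SIZE], scalar[CURVE25519_KEY_SIZE];
		u8 point[CURVE25519_KEY_SIZE];
		cycles_t start;
		unsigned int i;

		get_random_bytes(scalar, sizeof(scalar));
		get_random_bytes(point, sizeof(point));

		start = get_cycles();
		for (i = 0; i < iters; ++i)
			curve25519_arch(out, scalar, point);
		return div_u64((u64)(get_cycles() - start), iters);
	}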

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
[ardb: - move to arch/arm/crypto
       - wire into lib/crypto framework
       - implement crypto API KPP hooks ]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm/crypto/Kconfig           |   6 +
 arch/arm/crypto/Makefile          |   2 +
 arch/arm/crypto/curve25519-core.S | 347 +++++++++++++-----------------
 arch/arm/crypto/curve25519-glue.c | 127 +++++++++++
 4 files changed, 287 insertions(+), 195 deletions(-)
 create mode 100644 arch/arm/crypto/curve25519-glue.c

--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -143,4 +143,10 @@ config CRYPTO_NHPOLY1305_NEON
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_NHPOLY1305
 
+config CRYPTO_CURVE25519_NEON
+	tristate "NEON accelerated Curve25519 scalar multiplication library"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
 endif
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha51
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -38,6 +39,7 @@ chacha-neon-y := chacha-scalar-core.o ch
 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
 poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+curve25519-neon-y := curve25519-core.o curve25519-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL $@
--- a/arch/arm/crypto/curve25519-core.S
+++ b/arch/arm/crypto/curve25519-core.S
@@ -1,43 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
 /*
- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
- * SUPERCOP's curve25519/neon2/scalarmult.s.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
 */
 
-.fpu neon
+#include <linux/linkage.h>
+
 .text
+.fpu neon
+.arch armv7-a
 .align 4
-.global _crypto_scalarmult_curve25519_neon2
-.global crypto_scalarmult_curve25519_neon2
-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
- _crypto_scalarmult_curve25519_neon2:
- crypto_scalarmult_curve25519_neon2:
- vpush {q4, q5, q6, q7}
- mov r12, sp
- sub sp, sp, #736
- and sp, sp, #0xffffffe0
- strd r4, [sp, #0]
- strd r6, [sp, #8]
- strd r8, [sp, #16]
- strd r10, [sp, #24]
- str r12, [sp, #480]
- str r14, [sp, #484]
- mov r0, r0
- mov r1, r1
- mov r2, r2
- add r3, sp, #32
- ldr r4, =0
- ldr r5, =254
+
+ENTRY(curve25519_neon)
+ push {r4-r11, lr}
+ mov ip, sp
+ sub r3, sp, #704
+ and r3, r3, #0xfffffff0
+ mov sp, r3
+ movw r4, #0
+ movw r5, #254
 vmov.i32 q0, #1
 vshr.u64 q1, q0, #7
 vshr.u64 q0, q0, #8
 vmov.i32 d4, #19
 vmov.i32 d5, #38
- add r6, sp, #512
- vst1.8 {d2-d3}, [r6, : 128]
- add r6, sp, #528
- vst1.8 {d0-d1}, [r6, : 128]
- add r6, sp, #544
+ add r6, sp, #480
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
 vst1.8 {d4-d5}, [r6, : 128]
 add r6, r3, #0
 vmov.i32 q2, #0
@@ -45,12 +37,12 @@
 vst1.8 {d4-d5}, [r6, : 128]!
 vst1.8 d4, [r6, : 64]
 add r6, r3, #0
- ldr r7, =960
+ movw r7, #960
 sub r7, r7, #2
 neg r7, r7
 sub r7, r7, r7, LSL #7
 str r7, [r6]
- add r6, sp, #704
+ add r6, sp, #672
 vld1.8 {d4-d5}, [r1]!
 vld1.8 {d6-d7}, [r1]
 vst1.8 {d4-d5}, [r6, : 128]!
@@ -212,15 +204,15 @@
 vst1.8 {d0-d1}, [r6, : 128]!
 vst1.8 {d2-d3}, [r6, : 128]!
 vst1.8 d4, [r6, : 64]
-._mainloop:
+.Lmainloop:
 mov r2, r5, LSR #3
 and r6, r5, #7
 ldrb r2, [r1, r2]
 mov r2, r2, LSR r6
 and r2, r2, #1
- str r5, [sp, #488]
+ str r5, [sp, #456]
 eor r4, r4, r2
- str r2, [sp, #492]
+ str r2, [sp, #460]
 neg r2, r4
 add r4, r3, #96
 add r5, r3, #192
@@ -291,7 +283,7 @@
 vsub.i32 q0, q1, q3
 vst1.8 d4, [r4, : 64]
 vst1.8 d0, [r6, : 64]
- add r2, sp, #544
+ add r2, sp, #512
 add r4, r3, #96
 add r5, r3, #144
 vld1.8 {d0-d1}, [r2, : 128]
@@ -361,14 +353,13 @@
 vmlal.s32 q0, d12, d8
 vmlal.s32 q0, d13, d17
 vmlal.s32 q0, d6, d6
- add r2, sp, #512
- vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
 vmull.s32 q3, d16, d7
 vmlal.s32 q3, d10, d15
 vmlal.s32 q3, d11, d14
 vmlal.s32 q3, d12, d9
 vmlal.s32 q3, d13, d8
- add r2, sp, #528
 vld1.8 {d8-d9}, [r2, : 128]
 vadd.i64 q5, q12, q9
 vadd.i64 q6, q15, q9
@@ -502,22 +493,19 @@
 vadd.i32 q5, q5, q0
 vtrn.32 q11, q14
 vadd.i32 q6, q6, q3
- add r2, sp, #560
+ add r2, sp, #528
 vadd.i32 q10, q10, q2
 vtrn.32 d24, d25
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q6, q13, #1
- add r2, sp, #576
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vshl.i32 q10, q14, #1
- add r2, sp, #592
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q15, q12, #1
 vadd.i32 q8, q8, q4
 vext.32 d10, d31, d30, #0
 vadd.i32 q7, q7, q1
- add r2, sp, #608
- vst1.8 {d16-d17}, [r2, : 128]
+ vst1.8 {d16-d17}, [r2, : 128]!
 vmull.s32 q8, d18, d5
 vmlal.s32 q8, d26, d4
 vmlal.s32 q8, d19, d9
@@ -528,8 +516,7 @@
 vmlal.s32 q8, d29, d1
 vmlal.s32 q8, d24, d6
 vmlal.s32 q8, d25, d0
- add r2, sp, #624
- vst1.8 {d14-d15}, [r2, : 128]
+ vst1.8 {d14-d15}, [r2, : 128]!
 vmull.s32 q2, d18, d4
 vmlal.s32 q2, d12, d9
 vmlal.s32 q2, d13, d8
@@ -537,8 +524,7 @@
 vmlal.s32 q2, d22, d2
 vmlal.s32 q2, d23, d1
 vmlal.s32 q2, d24, d0
- add r2, sp, #640
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vmull.s32 q7, d18, d9
 vmlal.s32 q7, d26, d3
 vmlal.s32 q7, d19, d8
@@ -547,14 +533,12 @@
 vmlal.s32 q7, d28, d1
 vmlal.s32 q7, d23, d6
 vmlal.s32 q7, d29, d0
- add r2, sp, #656
- vst1.8 {d10-d11}, [r2, : 128]
+ vst1.8 {d10-d11}, [r2, : 128]!
 vmull.s32 q5, d18, d3
 vmlal.s32 q5, d19, d2
 vmlal.s32 q5, d22, d1
 vmlal.s32 q5, d23, d0
 vmlal.s32 q5, d12, d8
- add r2, sp, #672
 vst1.8 {d16-d17}, [r2, : 128]
 vmull.s32 q4, d18, d8
 vmlal.s32 q4, d26, d2
@@ -566,7 +550,7 @@
 vmlal.s32 q8, d26, d1
 vmlal.s32 q8, d19, d6
 vmlal.s32 q8, d27, d0
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q7, d24, d21
 vmlal.s32 q7, d25, d20
@@ -575,32 +559,30 @@
 vmlal.s32 q8, d22, d21
 vmlal.s32 q8, d28, d20
 vmlal.s32 q5, d24, d20
- add r2, sp, #576
 vst1.8 {d14-d15}, [r2, : 128]
 vmull.s32 q7, d18, d6
 vmlal.s32 q7, d26, d0
- add r2, sp, #656
+ add r2, sp, #624
 vld1.8 {d30-d31}, [r2, : 128]
 vmlal.s32 q2, d30, d21
 vmlal.s32 q7, d19, d21
 vmlal.s32 q7, d27, d20
- add r2, sp, #624
+ add r2, sp, #592
 vld1.8 {d26-d27}, [r2, : 128]
 vmlal.s32 q4, d25, d27
 vmlal.s32 q8, d29, d27
 vmlal.s32 q8, d25, d26
 vmlal.s32 q7, d28, d27
 vmlal.s32 q7, d29, d26
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d28-d29}, [r2, : 128]
 vmlal.s32 q4, d24, d29
 vmlal.s32 q8, d23, d29
 vmlal.s32 q8, d24, d28
 vmlal.s32 q7, d22, d29
 vmlal.s32 q7, d23, d28
- add r2, sp, #608
 vst1.8 {d8-d9}, [r2, : 128]
- add r2, sp, #560
+ add r2, sp, #528
 vld1.8 {d8-d9}, [r2, : 128]
 vmlal.s32 q7, d24, d9
 vmlal.s32 q7, d25, d31
@@ -621,36 +603,36 @@
 vmlal.s32 q0, d23, d26
 vmlal.s32 q0, d24, d31
 vmlal.s32 q0, d19, d20
- add r2, sp, #640
+ add r2, sp, #608
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q2, d18, d7
- vmlal.s32 q2, d19, d6
 vmlal.s32 q5, d18, d6
- vmlal.s32 q5, d19, d21
 vmlal.s32 q1, d18, d21
- vmlal.s32 q1, d19, d29
 vmlal.s32 q0, d18, d28
- vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d19, d28
- add r2, sp, #592
+ add r2, sp, #560
 vld1.8 {d18-d19}, [r2, : 128]
- add r2, sp, #512
+ add r2, sp, #480
 vld1.8 {d22-d23}, [r2, : 128]
 vmlal.s32 q5, d19, d7
 vmlal.s32 q0, d18, d21
 vmlal.s32 q0, d19, d29
 vmlal.s32 q6, d18, d6
- add r2, sp, #528
+ add r2, sp, #496
 vld1.8 {d6-d7}, [r2, : 128]
 vmlal.s32 q6, d19, d21
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q0, d30, d8
- add r2, sp, #672
+ add r2, sp, #640
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q5, d30, d29
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d24-d25}, [r2, : 128]
 vmlal.s32 q1, d30, d28
 vadd.i64 q13, q0, q11
@@ -823,22 +805,19 @@
 vadd.i32 q5, q5, q0
 vtrn.32 q11, q14
 vadd.i32 q6, q6, q3
- add r2, sp, #560
+ add r2, sp, #528
 vadd.i32 q10, q10, q2
 vtrn.32 d24, d25
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q6, q13, #1
- add r2, sp, #576
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vshl.i32 q10, q14, #1
- add r2, sp, #592
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q15, q12, #1
 vadd.i32 q8, q8, q4
 vext.32 d10, d31, d30, #0
 vadd.i32 q7, q7, q1
- add r2, sp, #608
- vst1.8 {d16-d17}, [r2, : 128]
+ vst1.8 {d16-d17}, [r2, : 128]!
 vmull.s32 q8, d18, d5
 vmlal.s32 q8, d26, d4
 vmlal.s32 q8, d19, d9
@@ -849,8 +828,7 @@
 vmlal.s32 q8, d29, d1
 vmlal.s32 q8, d24, d6
 vmlal.s32 q8, d25, d0
- add r2, sp, #624
- vst1.8 {d14-d15}, [r2, : 128]
+ vst1.8 {d14-d15}, [r2, : 128]!
 vmull.s32 q2, d18, d4
 vmlal.s32 q2, d12, d9
 vmlal.s32 q2, d13, d8
@@ -858,8 +836,7 @@
 vmlal.s32 q2, d22, d2
 vmlal.s32 q2, d23, d1
 vmlal.s32 q2, d24, d0
- add r2, sp, #640
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vmull.s32 q7, d18, d9
 vmlal.s32 q7, d26, d3
 vmlal.s32 q7, d19, d8
@@ -868,15 +845,13 @@
 vmlal.s32 q7, d28, d1
 vmlal.s32 q7, d23, d6
 vmlal.s32 q7, d29, d0
- add r2, sp, #656
- vst1.8 {d10-d11}, [r2, : 128]
+ vst1.8 {d10-d11}, [r2, : 128]!
 vmull.s32 q5, d18, d3
 vmlal.s32 q5, d19, d2
 vmlal.s32 q5, d22, d1
 vmlal.s32 q5, d23, d0
 vmlal.s32 q5, d12, d8
- add r2, sp, #672
- vst1.8 {d16-d17}, [r2, : 128]
+ vst1.8 {d16-d17}, [r2, : 128]!
 vmull.s32 q4, d18, d8
 vmlal.s32 q4, d26, d2
 vmlal.s32 q4, d19, d7
@@ -887,7 +862,7 @@
 vmlal.s32 q8, d26, d1
 vmlal.s32 q8, d19, d6
 vmlal.s32 q8, d27, d0
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q7, d24, d21
 vmlal.s32 q7, d25, d20
@@ -896,32 +871,30 @@
 vmlal.s32 q8, d22, d21
 vmlal.s32 q8, d28, d20
 vmlal.s32 q5, d24, d20
- add r2, sp, #576
 vst1.8 {d14-d15}, [r2, : 128]
 vmull.s32 q7, d18, d6
 vmlal.s32 q7, d26, d0
- add r2, sp, #656
+ add r2, sp, #624
 vld1.8 {d30-d31}, [r2, : 128]
 vmlal.s32 q2, d30, d21
 vmlal.s32 q7, d19, d21
 vmlal.s32 q7, d27, d20
- add r2, sp, #624
+ add r2, sp, #592
 vld1.8 {d26-d27}, [r2, : 128]
 vmlal.s32 q4, d25, d27
 vmlal.s32 q8, d29, d27
 vmlal.s32 q8, d25, d26
 vmlal.s32 q7, d28, d27
 vmlal.s32 q7, d29, d26
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d28-d29}, [r2, : 128]
 vmlal.s32 q4, d24, d29
 vmlal.s32 q8, d23, d29
 vmlal.s32 q8, d24, d28
 vmlal.s32 q7, d22, d29
 vmlal.s32 q7, d23, d28
- add r2, sp, #608
 vst1.8 {d8-d9}, [r2, : 128]
- add r2, sp, #560
+ add r2, sp, #528
 vld1.8 {d8-d9}, [r2, : 128]
 vmlal.s32 q7, d24, d9
 vmlal.s32 q7, d25, d31
@@ -942,36 +915,36 @@
 vmlal.s32 q0, d23, d26
 vmlal.s32 q0, d24, d31
 vmlal.s32 q0, d19, d20
- add r2, sp, #640
+ add r2, sp, #608
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q2, d18, d7
- vmlal.s32 q2, d19, d6
 vmlal.s32 q5, d18, d6
- vmlal.s32 q5, d19, d21
 vmlal.s32 q1, d18, d21
- vmlal.s32 q1, d19, d29
 vmlal.s32 q0, d18, d28
- vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d19, d28
- add r2, sp, #592
+ add r2, sp, #560
 vld1.8 {d18-d19}, [r2, : 128]
- add r2, sp, #512
+ add r2, sp, #480
 vld1.8 {d22-d23}, [r2, : 128]
 vmlal.s32 q5, d19, d7
 vmlal.s32 q0, d18, d21
 vmlal.s32 q0, d19, d29
 vmlal.s32 q6, d18, d6
- add r2, sp, #528
+ add r2, sp, #496
 vld1.8 {d6-d7}, [r2, : 128]
 vmlal.s32 q6, d19, d21
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q0, d30, d8
- add r2, sp, #672
+ add r2, sp, #640
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q5, d30, d29
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d24-d25}, [r2, : 128]
 vmlal.s32 q1, d30, d28
 vadd.i64 q13, q0, q11
@@ -1069,7 +1042,7 @@
 sub r4, r4, #24
 vst1.8 d0, [r2, : 64]
 vst1.8 d1, [r4, : 64]
- add r2, sp, #544
+ add r2, sp, #512
 add r4, r3, #144
 add r5, r3, #192
 vld1.8 {d0-d1}, [r2, : 128]
@@ -1139,14 +1112,13 @@
 vmlal.s32 q0, d12, d8
 vmlal.s32 q0, d13, d17
 vmlal.s32 q0, d6, d6
- add r2, sp, #512
- vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
 vmull.s32 q3, d16, d7
 vmlal.s32 q3, d10, d15
 vmlal.s32 q3, d11, d14
 vmlal.s32 q3, d12, d9
 vmlal.s32 q3, d13, d8
- add r2, sp, #528
 vld1.8 {d8-d9}, [r2, : 128]
 vadd.i64 q5, q12, q9
 vadd.i64 q6, q15, q9
@@ -1295,22 +1267,19 @@
 vadd.i32 q5, q5, q0
 vtrn.32 q11, q14
 vadd.i32 q6, q6, q3
- add r2, sp, #560
+ add r2, sp, #528
 vadd.i32 q10, q10, q2
 vtrn.32 d24, d25
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q6, q13, #1
- add r2, sp, #576
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vshl.i32 q10, q14, #1
- add r2, sp, #592
- vst1.8 {d12-d13}, [r2, : 128]
+ vst1.8 {d12-d13}, [r2, : 128]!
 vshl.i32 q15, q12, #1
 vadd.i32 q8, q8, q4
 vext.32 d10, d31, d30, #0
 vadd.i32 q7, q7, q1
- add r2, sp, #608
- vst1.8 {d16-d17}, [r2, : 128]
+ vst1.8 {d16-d17}, [r2, : 128]!
 vmull.s32 q8, d18, d5
 vmlal.s32 q8, d26, d4
 vmlal.s32 q8, d19, d9
@@ -1321,8 +1290,7 @@
 vmlal.s32 q8, d29, d1
 vmlal.s32 q8, d24, d6
 vmlal.s32 q8, d25, d0
- add r2, sp, #624
- vst1.8 {d14-d15}, [r2, : 128]
+ vst1.8 {d14-d15}, [r2, : 128]!
 vmull.s32 q2, d18, d4
 vmlal.s32 q2, d12, d9
 vmlal.s32 q2, d13, d8
@@ -1330,8 +1298,7 @@
 vmlal.s32 q2, d22, d2
 vmlal.s32 q2, d23, d1
 vmlal.s32 q2, d24, d0
- add r2, sp, #640
- vst1.8 {d20-d21}, [r2, : 128]
+ vst1.8 {d20-d21}, [r2, : 128]!
 vmull.s32 q7, d18, d9
 vmlal.s32 q7, d26, d3
 vmlal.s32 q7, d19, d8
@@ -1340,15 +1307,13 @@
 vmlal.s32 q7, d28, d1
 vmlal.s32 q7, d23, d6
 vmlal.s32 q7, d29, d0
- add r2, sp, #656
- vst1.8 {d10-d11}, [r2, : 128]
+ vst1.8 {d10-d11}, [r2, : 128]!
 vmull.s32 q5, d18, d3
 vmlal.s32 q5, d19, d2
 vmlal.s32 q5, d22, d1
 vmlal.s32 q5, d23, d0
 vmlal.s32 q5, d12, d8
- add r2, sp, #672
- vst1.8 {d16-d17}, [r2, : 128]
+ vst1.8 {d16-d17}, [r2, : 128]!
 vmull.s32 q4, d18, d8
 vmlal.s32 q4, d26, d2
 vmlal.s32 q4, d19, d7
@@ -1359,7 +1324,7 @@
 vmlal.s32 q8, d26, d1
 vmlal.s32 q8, d19, d6
 vmlal.s32 q8, d27, d0
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q7, d24, d21
 vmlal.s32 q7, d25, d20
@@ -1368,32 +1333,30 @@
 vmlal.s32 q8, d22, d21
 vmlal.s32 q8, d28, d20
 vmlal.s32 q5, d24, d20
- add r2, sp, #576
 vst1.8 {d14-d15}, [r2, : 128]
 vmull.s32 q7, d18, d6
 vmlal.s32 q7, d26, d0
- add r2, sp, #656
+ add r2, sp, #624
 vld1.8 {d30-d31}, [r2, : 128]
 vmlal.s32 q2, d30, d21
 vmlal.s32 q7, d19, d21
 vmlal.s32 q7, d27, d20
- add r2, sp, #624
+ add r2, sp, #592
 vld1.8 {d26-d27}, [r2, : 128]
 vmlal.s32 q4, d25, d27
 vmlal.s32 q8, d29, d27
 vmlal.s32 q8, d25, d26
 vmlal.s32 q7, d28, d27
 vmlal.s32 q7, d29, d26
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d28-d29}, [r2, : 128]
 vmlal.s32 q4, d24, d29
 vmlal.s32 q8, d23, d29
 vmlal.s32 q8, d24, d28
 vmlal.s32 q7, d22, d29
 vmlal.s32 q7, d23, d28
- add r2, sp, #608
 vst1.8 {d8-d9}, [r2, : 128]
- add r2, sp, #560
+ add r2, sp, #528
 vld1.8 {d8-d9}, [r2, : 128]
 vmlal.s32 q7, d24, d9
 vmlal.s32 q7, d25, d31
@@ -1414,36 +1377,36 @@
 vmlal.s32 q0, d23, d26
 vmlal.s32 q0, d24, d31
 vmlal.s32 q0, d19, d20
- add r2, sp, #640
+ add r2, sp, #608
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q2, d18, d7
- vmlal.s32 q2, d19, d6
 vmlal.s32 q5, d18, d6
- vmlal.s32 q5, d19, d21
 vmlal.s32 q1, d18, d21
- vmlal.s32 q1, d19, d29
 vmlal.s32 q0, d18, d28
- vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
 vmlal.s32 q6, d19, d28
- add r2, sp, #592
+ add r2, sp, #560
 vld1.8 {d18-d19}, [r2, : 128]
- add r2, sp, #512
+ add r2, sp, #480
 vld1.8 {d22-d23}, [r2, : 128]
 vmlal.s32 q5, d19, d7
 vmlal.s32 q0, d18, d21
 vmlal.s32 q0, d19, d29
 vmlal.s32 q6, d18, d6
- add r2, sp, #528
+ add r2, sp, #496
 vld1.8 {d6-d7}, [r2, : 128]
 vmlal.s32 q6, d19, d21
- add r2, sp, #576
+ add r2, sp, #544
 vld1.8 {d18-d19}, [r2, : 128]
 vmlal.s32 q0, d30, d8
- add r2, sp, #672
+ add r2, sp, #640
 vld1.8 {d20-d21}, [r2, : 128]
 vmlal.s32 q5, d30, d29
- add r2, sp, #608
+ add r2, sp, #576
 vld1.8 {d24-d25}, [r2, : 128]
 vmlal.s32 q1, d30, d28
 vadd.i64 q13, q0, q11
@@ -1541,10 +1504,10 @@
 sub r4, r4, #24
 vst1.8 d0, [r2, : 64]
 vst1.8 d1, [r4, : 64]
- ldr r2, [sp, #488]
- ldr r4, [sp, #492]
+ ldr r2, [sp, #456]
+ ldr r4, [sp, #460]
 subs r5, r2, #1
- bge ._mainloop
+ bge .Lmainloop
 add r1, r3, #144
 add r2, r3, #336
 vld1.8 {d0-d1}, [r1, : 128]!
@@ -1553,41 +1516,41 @@
 vst1.8 {d0-d1}, [r2, : 128]!
 vst1.8 {d2-d3}, [r2, : 128]!
 vst1.8 d4, [r2, : 64]
- ldr r1, =0
-._invertloop:
+ movw r1, #0
+.Linvertloop:
 add r2, r3, #144
- ldr r4, =0
- ldr r5, =2
+ movw r4, #0
+ movw r5, #2
 cmp r1, #1
- ldreq r5, =1
+ moveq r5, #1
 addeq r2, r3, #336
 addeq r4, r3, #48
 cmp r1, #2
- ldreq r5, =1
+ moveq r5, #1
 addeq r2, r3, #48
 cmp r1, #3
- ldreq r5, =5
+ moveq r5, #5
 addeq r4, r3, #336
 cmp r1, #4
- ldreq r5, =10
+ moveq r5, #10
 cmp r1, #5
- ldreq r5, =20
+ moveq r5, #20
 cmp r1, #6
- ldreq r5, =10
+ moveq r5, #10
 addeq r2, r3, #336
 addeq r4, r3, #336
 cmp r1, #7
- ldreq r5, =50
+ moveq r5, #50
 cmp r1, #8
- ldreq r5, =100
+ moveq r5, #100
 cmp r1, #9
- ldreq r5, =50
+ moveq r5, #50
 addeq r2, r3, #336
 cmp r1, #10
- ldreq r5, =5
+ moveq r5, #5
 addeq r2, r3, #48
 cmp r1, #11
- ldreq r5, =0
+ moveq r5, #0
 addeq r2, r3, #96
 add r6, r3, #144
 add r7, r3, #288
@@ -1598,8 +1561,8 @@
 vst1.8 {d2-d3}, [r7, : 128]!
 vst1.8 d4, [r7, : 64]
 cmp r5, #0
- beq ._skipsquaringloop
-._squaringloop:
+ beq .Lskipsquaringloop
+.Lsquaringloop:
 add r6, r3, #288
 add r7, r3, #288
 add r8, r3, #288
@@ -1611,7 +1574,7 @@
 vld1.8 {d6-d7}, [r7, : 128]!
 vld1.8 {d9}, [r7, : 64]
 vld1.8 {d10-d11}, [r6, : 128]!
- add r7, sp, #416
+ add r7, sp, #384
 vld1.8 {d12-d13}, [r6, : 128]!
 vmul.i32 q7, q2, q0
 vld1.8 {d8}, [r6, : 64]
@@ -1726,7 +1689,7 @@
 vext.32 d10, d6, d6, #0
 vmov.i32 q1, #0xffffffff
 vshl.i64 q4, q1, #25
- add r7, sp, #512
+ add r7, sp, #480
 vld1.8 {d14-d15}, [r7, : 128]
 vadd.i64 q9, q2, q7
 vshl.i64 q1, q1, #26
@@ -1735,7 +1698,7 @@
 vadd.i64 q5, q5, q10
 vand q9, q9, q1
 vld1.8 {d16}, [r6, : 64]!
- add r6, sp, #528
+ add r6, sp, #496
 vld1.8 {d20-d21}, [r6, : 128]
 vadd.i64 q11, q5, q10
 vsub.i64 q2, q2, q9
@@ -1789,8 +1752,8 @@
 sub r6, r6, #32
 vst1.8 d4, [r6, : 64]
 subs r5, r5, #1
- bhi ._squaringloop
-._skipsquaringloop:
+ bhi .Lsquaringloop
+.Lskipsquaringloop:
 mov r2, r2
 add r5, r3, #288
 add r6, r3, #144
@@ -1802,7 +1765,7 @@
 vld1.8 {d6-d7}, [r5, : 128]!
 vld1.8 {d9}, [r5, : 64]
 vld1.8 {d10-d11}, [r2, : 128]!
- add r5, sp, #416
+ add r5, sp, #384
 vld1.8 {d12-d13}, [r2, : 128]!
 vmul.i32 q7, q2, q0
 vld1.8 {d8}, [r2, : 64]
@@ -1917,7 +1880,7 @@
 vext.32 d10, d6, d6, #0
 vmov.i32 q1, #0xffffffff
 vshl.i64 q4, q1, #25
- add r5, sp, #512
+ add r5, sp, #480
 vld1.8 {d14-d15}, [r5, : 128]
 vadd.i64 q9, q2, q7
 vshl.i64 q1, q1, #26
@@ -1926,7 +1889,7 @@
 vadd.i64 q5, q5, q10
 vand q9, q9, q1
 vld1.8 {d16}, [r2, : 64]!
- add r2, sp, #528
+ add r2, sp, #496
 vld1.8 {d20-d21}, [r2, : 128]
 vadd.i64 q11, q5, q10
 vsub.i64 q2, q2, q9
@@ -1980,7 +1943,7 @@
 sub r2, r2, #32
 vst1.8 d4, [r2, : 64]
 cmp r4, #0
- beq ._skippostcopy
+ beq .Lskippostcopy
 add r2, r3, #144
 mov r4, r4
 vld1.8 {d0-d1}, [r2, : 128]!
@@ -1989,9 +1952,9 @@
 vst1.8 {d0-d1}, [r4, : 128]!
 vst1.8 {d2-d3}, [r4, : 128]!
 vst1.8 d4, [r4, : 64]
-._skippostcopy:
+.Lskippostcopy:
 cmp r1, #1
- bne ._skipfinalcopy
+ bne .Lskipfinalcopy
 add r2, r3, #288
 add r4, r3, #144
 vld1.8 {d0-d1}, [r2, : 128]!
@@ -2000,10 +1963,10 @@
 vst1.8 {d0-d1}, [r4, : 128]!
 vst1.8 {d2-d3}, [r4, : 128]!
 vst1.8 d4, [r4, : 64]
-._skipfinalcopy:
+.Lskipfinalcopy:
 add r1, r1, #1
 cmp r1, #12
- blo ._invertloop
+ blo .Linvertloop
 add r1, r3, #144
 ldr r2, [r1], #4
 ldr r3, [r1], #4
@@ -2085,21 +2048,15 @@
 add r8, r8, r10, LSL #12
 mov r9, r10, LSR #20
 add r1, r9, r1, LSL #6
- str r2, [r0], #4
- str r3, [r0], #4
- str r4, [r0], #4
- str r5, [r0], #4
- str r6, [r0], #4
- str r7, [r0], #4
- str r8, [r0], #4
- str r1, [r0]
- ldrd r4, [sp, #0]
- ldrd r6, [sp, #8]
- ldrd r8, [sp, #16]
- ldrd r10, [sp, #24]
- ldr r12, [sp, #480]
- ldr r14, [sp, #484]
- ldr r0, =0
- mov sp, r12
- vpop {q4, q5, q6, q7}
- bx lr
+ str r2, [r0]
+ str r3, [r0, #4]
+ str r4, [r0, #8]
+ str r5, [r0, #12]
+ str r6, [r0, #16]
+ str r7, [r0, #20]
+ str r8, [r0, #24]
+ str r1, [r0, #28]
+ movw r0, #0
+ mov sp, ip
+ pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
--- /dev/null
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/kpp.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <crypto/curve25519.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+				const u8 secret[CURVE25519_KEY_SIZE],
+				const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+		     const u8 scalar[CURVE25519_KEY_SIZE],
+		     const u8 point[CURVE25519_KEY_SIZE])
+{
+	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		kernel_neon_begin();
+		curve25519_neon(out, scalar, point);
+		kernel_neon_end();
+	} else {
+		curve25519_generic(out, scalar, point);
+	}
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+				 unsigned int len)
+{
+	u8 *secret = kpp_tfm_ctx(tfm);
+
+	if (!len)
+		curve25519_generate_secret(secret);
+	else if (len == CURVE25519_KEY_SIZE &&
+		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+		memcpy(secret, buf, CURVE25519_KEY_SIZE);
+	else
+		return -EINVAL;
+	return 0;
+}
+
+static int curve25519_compute_value(struct kpp_request *req)
+{
+	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+	const u8 *secret = kpp_tfm_ctx(tfm);
+	u8 public_key[CURVE25519_KEY_SIZE];
+	u8 buf[CURVE25519_KEY_SIZE];
+	int copied, nbytes;
+	u8 const *bp;
+
+	if (req->src) {
+		copied = sg_copy_to_buffer(req->src,
+					   sg_nents_for_len(req->src,
+							    CURVE25519_KEY_SIZE),
+					   public_key, CURVE25519_KEY_SIZE);
+		if (copied != CURVE25519_KEY_SIZE)
+			return -EINVAL;
+		bp = public_key;
+	} else {
+		bp = curve25519_base_point;
+	}
+
+	curve25519_arch(buf, secret, bp);
+
+	/* might want less than we've got */
+	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+	copied = sg_copy_from_buffer(req->dst,
+				     sg_nents_for_len(req->dst, nbytes),
+				     buf, nbytes);
+	if (copied != nbytes)
+		return -EINVAL;
+	return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+	return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+	.base.cra_name		= "curve25519",
+	.base.cra_driver_name	= "curve25519-neon",
+	.base.cra_priority	= 200,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
+
+	.set_secret		= curve25519_set_secret,
+	.generate_public_key	= curve25519_compute_value,
+	.compute_shared_secret	= curve25519_compute_value,
+	.max_size		= curve25519_max_size,
+};
+
+static int __init mod_init(void)
+{
+	if (elf_hwcap & HWCAP_NEON) {
+		static_branch_enable(&have_neon);
+		return crypto_register_kpp(&curve25519_alg);
+	}
+	return 0;
+}
+
+static void __exit mod_exit(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-neon");
+MODULE_LICENSE("GPL v2");
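
A note for reviewers unfamiliar with the crypto API's kpp interface: the
sketch below shows roughly how an in-kernel user might exercise the new
transform. It is illustrative only and not part of this patch; the
crypto_kpp_*/kpp_request_* helpers are the standard <crypto/kpp.h> API,
while demo_curve25519_public_key is a made-up name. Buffers mapped into
the scatterlist must be linear kernel memory (not stack), and this
driver completes synchronously, so no completion callback is set up.

	#include <crypto/curve25519.h>
	#include <crypto/kpp.h>
	#include <linux/err.h>
	#include <linux/scatterlist.h>

	static int demo_curve25519_public_key(const u8 *secret, u8 *public_key)
	{
		struct crypto_kpp *tfm;
		struct kpp_request *req;
		struct scatterlist dst;
		int err;

		/* Resolves to "curve25519-neon" when NEON is present, since
		 * its cra_priority of 200 beats the generic implementation.
		 */
		tfm = crypto_alloc_kpp("curve25519", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_kpp_set_secret(tfm, secret, CURVE25519_KEY_SIZE);
		if (err)
			goto out_free_tfm;

		req = kpp_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_free_tfm;
		}

		/* A NULL src selects the curve's base point, so
		 * generate_public_key computes secret * G.
		 */
		kpp_request_set_input(req, NULL, 0);
		sg_init_one(&dst, public_key, CURVE25519_KEY_SIZE);
		kpp_request_set_output(req, &dst, CURVE25519_KEY_SIZE);
		err = crypto_kpp_generate_public_key(req);

		kpp_request_free(req);
	out_free_tfm:
		crypto_free_kpp(tfm);
		return err;
	}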