| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| Date: Sun, 5 Jan 2020 22:40:48 -0500 |
| Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for |
| kernel |
| |
| commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream. |
| |
| These x86_64 vectorized implementations support AVX, AVX-2, and AVX-512F. |
| The AVX-512F implementation is disabled on Skylake, due to AVX-512 |
| frequency throttling, but it is quite fast on >= Cannonlake. |
| |
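| As a concrete illustration, disabling the AVX-512 path on Skylake can |
| be expressed as a CPU model check in the C glue code, roughly like the |
| following (a minimal sketch; the static-key name is illustrative and |
| the exact condition in poly1305_glue.c may differ): |
| |
|     if (IS_ENABLED(CONFIG_AS_AVX512) && |
|         boot_cpu_has(X86_FEATURE_AVX512F) && |
|         cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | |
|                           XFEATURE_MASK_AVX512, NULL) && |
|         /* Skylake-X throttles its clock on AVX-512, so avoid it. */ |
|         boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) |
|             static_branch_enable(&poly1305_use_avx512); |
| |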
| On the left are cycle counts on a Core i7 6700HQ using the AVX-2 |
| codepath, comparing this implementation ("new") to the implementation in |
| the current crypto API ("old"). On the right are benchmarks on a Xeon |
| Gold 5120 using the AVX-512 codepath. The new implementation is faster |
| on all benchmarks. |
| |
| AVX-2 AVX-512 |
| --------- ----------- |
| |
| size old new size old new |
| ---- ---- ---- ---- ---- ---- |
| 0 70 68 0 74 70 |
| 16 92 90 16 96 92 |
| 32 134 104 32 136 106 |
| 48 172 120 48 184 124 |
| 64 218 136 64 218 138 |
| 80 254 158 80 260 160 |
| 96 298 174 96 300 176 |
| 112 342 192 112 342 194 |
| 128 388 212 128 384 212 |
| 144 428 228 144 420 226 |
| 160 466 246 160 464 248 |
| 176 510 264 176 504 264 |
| 192 550 282 192 544 282 |
| 208 594 302 208 582 300 |
| 224 628 316 224 624 318 |
| 240 676 334 240 662 338 |
| 256 716 354 256 708 358 |
| 272 764 374 272 748 372 |
| 288 802 352 288 788 358 |
| 304 420 366 304 422 370 |
| 320 428 360 320 432 364 |
| 336 484 378 336 486 380 |
| 352 426 384 352 434 390 |
| 368 478 400 368 480 408 |
| 384 488 394 384 490 398 |
| 400 542 408 400 542 412 |
| 416 486 416 416 492 426 |
| 432 534 430 432 538 436 |
| 448 544 422 448 546 432 |
| 464 600 438 464 600 448 |
| 480 540 448 480 548 456 |
| 496 594 464 496 594 476 |
| 512 602 456 512 606 470 |
| 528 656 476 528 656 480 |
| 544 600 480 544 606 498 |
| 560 650 494 560 652 512 |
| 576 664 490 576 662 508 |
| 592 714 508 592 716 522 |
| 608 656 514 608 664 538 |
| 624 708 532 624 710 552 |
| 640 716 524 640 720 516 |
| 656 770 536 656 772 526 |
| 672 716 548 672 722 544 |
| 688 770 562 688 768 556 |
| 704 774 552 704 778 556 |
| 720 826 568 720 832 568 |
| 736 768 574 736 780 584 |
| 752 822 592 752 826 600 |
| 768 830 584 768 836 560 |
| 784 884 602 784 888 572 |
| 800 828 610 800 838 588 |
| 816 884 628 816 884 604 |
| 832 888 618 832 894 598 |
| 848 942 632 848 946 612 |
| 864 884 644 864 896 628 |
| 880 936 660 880 942 644 |
| 896 948 652 896 952 608 |
| 912 1000 664 912 1004 616 |
| 928 942 676 928 954 634 |
| 944 994 690 944 1000 646 |
| 960 1002 680 960 1008 646 |
| 976 1054 694 976 1062 658 |
| 992 1002 706 992 1012 674 |
| 1008 1052 720 1008 1058 690 |
| |
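| Cycle counts of this sort are typically gathered with a small rdtsc |
| harness along these lines (a hypothetical sketch; the actual benchmark |
| rig is not part of this patch, and poly1305_update stands in for |
| whichever entry point is being timed): |
| |
|     static inline u64 rdtsc_cycles(void) |
|     { |
|             u32 lo, hi; |
| |
|             asm volatile("rdtsc" : "=a" (lo), "=d" (hi)); |
|             return ((u64)hi << 32) | lo; |
|     } |
| |
|     u64 t0 = rdtsc_cycles(); |
|     poly1305_update(&desc, buf, size);      /* one size per row */ |
|     u64 t1 = rdtsc_cycles();                /* cycles = t1 - t0 */ |
| |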
| This commit wires in the prior implementation from Andy Polyakov, and |
| makes the following changes to make it suitable for kernel land. |
| |
| - Some cosmetic and structural changes, like renaming labels to |
| .Lname, adjusting constants, and following other Linux conventions, |
| as well as making the code easier for us to maintain moving forward. |
| |
| - CPU feature checking is done in C by the glue code (sketched below). |
| |
| - We avoid jumping into the middle of functions, to appease objtool, |
| and instead parameterize shared code. |
| |
| - We maintain frame pointers so that stack traces make sense. |
| |
| - We remove the dependency on the perl xlate code, which translates |
| the output for assembler flavors that we don't care about. |
| |
| Importantly, none of our changes affect the arithmetic or core code; |
| they only accommodate the differing environment of kernel space. |
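| |
| For the C-side feature dispatch mentioned above, the resulting glue |
| code ends up looking roughly like this (a simplified sketch; the |
| static-key names are illustrative and the real logic in |
| poly1305_glue.c differs in detail): |
| |
|     static void poly1305_simd_blocks(void *ctx, const u8 *inp, |
|                                      size_t len, const u32 padbit) |
|     { |
|             if (static_branch_likely(&poly1305_use_avx2)) |
|                     poly1305_blocks_avx2(ctx, inp, len, padbit); |
|             else if (static_branch_likely(&poly1305_use_avx)) |
|                     poly1305_blocks_avx(ctx, inp, len, padbit); |
|             else |
|                     poly1305_blocks_x86_64(ctx, inp, len, padbit); |
|     } |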
| |
| Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| Co-developed-by: Samuel Neves <sneves@dei.uc.pt> |
| Signed-off-by: Samuel Neves <sneves@dei.uc.pt> |
| Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| --- |
| arch/x86/crypto/.gitignore | 1 + |
| arch/x86/crypto/Makefile | 11 +- |
| arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ---------- |
| arch/x86/crypto/poly1305-sse2-x86_64.S | 590 --------------- |
| arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++-------- |
| arch/x86/crypto/poly1305_glue.c | 473 +++++------- |
| lib/crypto/Kconfig | 2 +- |
| 7 files changed, 572 insertions(+), 1577 deletions(-) |
| create mode 100644 arch/x86/crypto/.gitignore |
| delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S |
| delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S |
| |
| --- /dev/null |
| +++ b/arch/x86/crypto/.gitignore |
| @@ -0,0 +1 @@ |
| +poly1305-x86_64.S |
| --- a/arch/x86/crypto/Makefile |
| +++ b/arch/x86/crypto/Makefile |
| @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o |
| |
| nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o |
| blake2s-x86_64-y := blake2s-core.o blake2s-glue.o |
| +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o |
| +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) |
| +targets += poly1305-x86_64-cryptogams.S |
| +endif |
| |
| ifeq ($(avx_supported),yes) |
| camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ |
| @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni |
| aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o |
| ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
| sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
| -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o |
| ifeq ($(avx2_supported),yes) |
| sha1-ssse3-y += sha1_avx2_x86_64_asm.o |
| -poly1305-x86_64-y += poly1305-avx2-x86_64.o |
| endif |
| ifeq ($(sha1_ni_supported),yes) |
| sha1-ssse3-y += sha1_ni_asm.o |
| @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o |
| endif |
| sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o |
| crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o |
| + |
| +quiet_cmd_perlasm = PERLASM $@ |
| + cmd_perlasm = $(PERL) $< > $@ |
| +$(obj)/%.S: $(src)/%.pl FORCE |
| + $(call if_changed,perlasm) |
| --- a/arch/x86/crypto/poly1305-avx2-x86_64.S |
| +++ /dev/null |
| @@ -1,390 +0,0 @@ |
| -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| -/* |
| - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions |
| - * |
| - * Copyright (C) 2015 Martin Willi |
| - */ |
| - |
| -#include <linux/linkage.h> |
| - |
| -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 |
| -.align 32 |
| -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| - .octa 0x0000000003ffffff0000000003ffffff |
| - |
| -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 |
| -.align 32 |
| -ORMASK: .octa 0x00000000010000000000000001000000 |
| - .octa 0x00000000010000000000000001000000 |
| - |
| -.text |
| - |
| -#define h0 0x00(%rdi) |
| -#define h1 0x04(%rdi) |
| -#define h2 0x08(%rdi) |
| -#define h3 0x0c(%rdi) |
| -#define h4 0x10(%rdi) |
| -#define r0 0x00(%rdx) |
| -#define r1 0x04(%rdx) |
| -#define r2 0x08(%rdx) |
| -#define r3 0x0c(%rdx) |
| -#define r4 0x10(%rdx) |
| -#define u0 0x00(%r8) |
| -#define u1 0x04(%r8) |
| -#define u2 0x08(%r8) |
| -#define u3 0x0c(%r8) |
| -#define u4 0x10(%r8) |
| -#define w0 0x18(%r8) |
| -#define w1 0x1c(%r8) |
| -#define w2 0x20(%r8) |
| -#define w3 0x24(%r8) |
| -#define w4 0x28(%r8) |
| -#define y0 0x30(%r8) |
| -#define y1 0x34(%r8) |
| -#define y2 0x38(%r8) |
| -#define y3 0x3c(%r8) |
| -#define y4 0x40(%r8) |
| -#define m %rsi |
| -#define hc0 %ymm0 |
| -#define hc1 %ymm1 |
| -#define hc2 %ymm2 |
| -#define hc3 %ymm3 |
| -#define hc4 %ymm4 |
| -#define hc0x %xmm0 |
| -#define hc1x %xmm1 |
| -#define hc2x %xmm2 |
| -#define hc3x %xmm3 |
| -#define hc4x %xmm4 |
| -#define t1 %ymm5 |
| -#define t2 %ymm6 |
| -#define t1x %xmm5 |
| -#define t2x %xmm6 |
| -#define ruwy0 %ymm7 |
| -#define ruwy1 %ymm8 |
| -#define ruwy2 %ymm9 |
| -#define ruwy3 %ymm10 |
| -#define ruwy4 %ymm11 |
| -#define ruwy0x %xmm7 |
| -#define ruwy1x %xmm8 |
| -#define ruwy2x %xmm9 |
| -#define ruwy3x %xmm10 |
| -#define ruwy4x %xmm11 |
| -#define svxz1 %ymm12 |
| -#define svxz2 %ymm13 |
| -#define svxz3 %ymm14 |
| -#define svxz4 %ymm15 |
| -#define d0 %r9 |
| -#define d1 %r10 |
| -#define d2 %r11 |
| -#define d3 %r12 |
| -#define d4 %r13 |
| - |
| -ENTRY(poly1305_4block_avx2) |
| - # %rdi: Accumulator h[5] |
| - # %rsi: 64 byte input block m |
| - # %rdx: Poly1305 key r[5] |
| - # %rcx: Quadblock count |
| - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], |
| - |
| - # This four-block variant uses loop unrolled block processing. It |
| - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: |
| - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r |
| - |
| - vzeroupper |
| - push %rbx |
| - push %r12 |
| - push %r13 |
| - |
| - # combine r0,u0,w0,y0 |
| - vmovd y0,ruwy0x |
| - vmovd w0,t1x |
| - vpunpcklqdq t1,ruwy0,ruwy0 |
| - vmovd u0,t1x |
| - vmovd r0,t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,ruwy0,ruwy0 |
| - |
| - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 |
| - vmovd y1,ruwy1x |
| - vmovd w1,t1x |
| - vpunpcklqdq t1,ruwy1,ruwy1 |
| - vmovd u1,t1x |
| - vmovd r1,t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,ruwy1,ruwy1 |
| - vpslld $2,ruwy1,svxz1 |
| - vpaddd ruwy1,svxz1,svxz1 |
| - |
| - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 |
| - vmovd y2,ruwy2x |
| - vmovd w2,t1x |
| - vpunpcklqdq t1,ruwy2,ruwy2 |
| - vmovd u2,t1x |
| - vmovd r2,t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,ruwy2,ruwy2 |
| - vpslld $2,ruwy2,svxz2 |
| - vpaddd ruwy2,svxz2,svxz2 |
| - |
| - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 |
| - vmovd y3,ruwy3x |
| - vmovd w3,t1x |
| - vpunpcklqdq t1,ruwy3,ruwy3 |
| - vmovd u3,t1x |
| - vmovd r3,t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,ruwy3,ruwy3 |
| - vpslld $2,ruwy3,svxz3 |
| - vpaddd ruwy3,svxz3,svxz3 |
| - |
| - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 |
| - vmovd y4,ruwy4x |
| - vmovd w4,t1x |
| - vpunpcklqdq t1,ruwy4,ruwy4 |
| - vmovd u4,t1x |
| - vmovd r4,t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,ruwy4,ruwy4 |
| - vpslld $2,ruwy4,svxz4 |
| - vpaddd ruwy4,svxz4,svxz4 |
| - |
| -.Ldoblock4: |
| - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, |
| - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] |
| - vmovd 0x00(m),hc0x |
| - vmovd 0x10(m),t1x |
| - vpunpcklqdq t1,hc0,hc0 |
| - vmovd 0x20(m),t1x |
| - vmovd 0x30(m),t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,hc0,hc0 |
| - vpand ANMASK(%rip),hc0,hc0 |
| - vmovd h0,t1x |
| - vpaddd t1,hc0,hc0 |
| - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, |
| - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] |
| - vmovd 0x03(m),hc1x |
| - vmovd 0x13(m),t1x |
| - vpunpcklqdq t1,hc1,hc1 |
| - vmovd 0x23(m),t1x |
| - vmovd 0x33(m),t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,hc1,hc1 |
| - vpsrld $2,hc1,hc1 |
| - vpand ANMASK(%rip),hc1,hc1 |
| - vmovd h1,t1x |
| - vpaddd t1,hc1,hc1 |
| - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, |
| - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] |
| - vmovd 0x06(m),hc2x |
| - vmovd 0x16(m),t1x |
| - vpunpcklqdq t1,hc2,hc2 |
| - vmovd 0x26(m),t1x |
| - vmovd 0x36(m),t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,hc2,hc2 |
| - vpsrld $4,hc2,hc2 |
| - vpand ANMASK(%rip),hc2,hc2 |
| - vmovd h2,t1x |
| - vpaddd t1,hc2,hc2 |
| - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, |
| - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] |
| - vmovd 0x09(m),hc3x |
| - vmovd 0x19(m),t1x |
| - vpunpcklqdq t1,hc3,hc3 |
| - vmovd 0x29(m),t1x |
| - vmovd 0x39(m),t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,hc3,hc3 |
| - vpsrld $6,hc3,hc3 |
| - vpand ANMASK(%rip),hc3,hc3 |
| - vmovd h3,t1x |
| - vpaddd t1,hc3,hc3 |
| - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), |
| - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] |
| - vmovd 0x0c(m),hc4x |
| - vmovd 0x1c(m),t1x |
| - vpunpcklqdq t1,hc4,hc4 |
| - vmovd 0x2c(m),t1x |
| - vmovd 0x3c(m),t2x |
| - vpunpcklqdq t2,t1,t1 |
| - vperm2i128 $0x20,t1,hc4,hc4 |
| - vpsrld $8,hc4,hc4 |
| - vpor ORMASK(%rip),hc4,hc4 |
| - vmovd h4,t1x |
| - vpaddd t1,hc4,hc4 |
| - |
| - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] |
| - vpmuludq hc0,ruwy0,t1 |
| - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] |
| - vpmuludq hc1,svxz4,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] |
| - vpmuludq hc2,svxz3,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] |
| - vpmuludq hc3,svxz2,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] |
| - vpmuludq hc4,svxz1,t2 |
| - vpaddq t2,t1,t1 |
| - # d0 = t1[0] + t1[1] + t[2] + t[3] |
| - vpermq $0xee,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vpsrldq $8,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vmovq t1x,d0 |
| - |
| - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] |
| - vpmuludq hc0,ruwy1,t1 |
| - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] |
| - vpmuludq hc1,ruwy0,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] |
| - vpmuludq hc2,svxz4,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] |
| - vpmuludq hc3,svxz3,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] |
| - vpmuludq hc4,svxz2,t2 |
| - vpaddq t2,t1,t1 |
| - # d1 = t1[0] + t1[1] + t1[3] + t1[4] |
| - vpermq $0xee,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vpsrldq $8,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vmovq t1x,d1 |
| - |
| - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] |
| - vpmuludq hc0,ruwy2,t1 |
| - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] |
| - vpmuludq hc1,ruwy1,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] |
| - vpmuludq hc2,ruwy0,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] |
| - vpmuludq hc3,svxz4,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] |
| - vpmuludq hc4,svxz3,t2 |
| - vpaddq t2,t1,t1 |
| - # d2 = t1[0] + t1[1] + t1[2] + t1[3] |
| - vpermq $0xee,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vpsrldq $8,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vmovq t1x,d2 |
| - |
| - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] |
| - vpmuludq hc0,ruwy3,t1 |
| - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] |
| - vpmuludq hc1,ruwy2,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] |
| - vpmuludq hc2,ruwy1,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] |
| - vpmuludq hc3,ruwy0,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] |
| - vpmuludq hc4,svxz4,t2 |
| - vpaddq t2,t1,t1 |
| - # d3 = t1[0] + t1[1] + t1[2] + t1[3] |
| - vpermq $0xee,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vpsrldq $8,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vmovq t1x,d3 |
| - |
| - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] |
| - vpmuludq hc0,ruwy4,t1 |
| - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] |
| - vpmuludq hc1,ruwy3,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] |
| - vpmuludq hc2,ruwy2,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] |
| - vpmuludq hc3,ruwy1,t2 |
| - vpaddq t2,t1,t1 |
| - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] |
| - vpmuludq hc4,ruwy0,t2 |
| - vpaddq t2,t1,t1 |
| - # d4 = t1[0] + t1[1] + t1[2] + t1[3] |
| - vpermq $0xee,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vpsrldq $8,t1,t2 |
| - vpaddq t2,t1,t1 |
| - vmovq t1x,d4 |
| - |
| - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| - # integers. It's true in a single-block implementation, but not here. |
| - |
| - # d1 += d0 >> 26 |
| - mov d0,%rax |
| - shr $26,%rax |
| - add %rax,d1 |
| - # h0 = d0 & 0x3ffffff |
| - mov d0,%rbx |
| - and $0x3ffffff,%ebx |
| - |
| - # d2 += d1 >> 26 |
| - mov d1,%rax |
| - shr $26,%rax |
| - add %rax,d2 |
| - # h1 = d1 & 0x3ffffff |
| - mov d1,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h1 |
| - |
| - # d3 += d2 >> 26 |
| - mov d2,%rax |
| - shr $26,%rax |
| - add %rax,d3 |
| - # h2 = d2 & 0x3ffffff |
| - mov d2,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h2 |
| - |
| - # d4 += d3 >> 26 |
| - mov d3,%rax |
| - shr $26,%rax |
| - add %rax,d4 |
| - # h3 = d3 & 0x3ffffff |
| - mov d3,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h3 |
| - |
| - # h0 += (d4 >> 26) * 5 |
| - mov d4,%rax |
| - shr $26,%rax |
| - lea (%rax,%rax,4),%rax |
| - add %rax,%rbx |
| - # h4 = d4 & 0x3ffffff |
| - mov d4,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h4 |
| - |
| - # h1 += h0 >> 26 |
| - mov %rbx,%rax |
| - shr $26,%rax |
| - add %eax,h1 |
| - # h0 = h0 & 0x3ffffff |
| - andl $0x3ffffff,%ebx |
| - mov %ebx,h0 |
| - |
| - add $0x40,m |
| - dec %rcx |
| - jnz .Ldoblock4 |
| - |
| - vzeroupper |
| - pop %r13 |
| - pop %r12 |
| - pop %rbx |
| - ret |
| -ENDPROC(poly1305_4block_avx2) |
| --- a/arch/x86/crypto/poly1305-sse2-x86_64.S |
| +++ /dev/null |
| @@ -1,590 +0,0 @@ |
| -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| -/* |
| - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions |
| - * |
| - * Copyright (C) 2015 Martin Willi |
| - */ |
| - |
| -#include <linux/linkage.h> |
| - |
| -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 |
| -.align 16 |
| -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| - |
| -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 |
| -.align 16 |
| -ORMASK: .octa 0x00000000010000000000000001000000 |
| - |
| -.text |
| - |
| -#define h0 0x00(%rdi) |
| -#define h1 0x04(%rdi) |
| -#define h2 0x08(%rdi) |
| -#define h3 0x0c(%rdi) |
| -#define h4 0x10(%rdi) |
| -#define r0 0x00(%rdx) |
| -#define r1 0x04(%rdx) |
| -#define r2 0x08(%rdx) |
| -#define r3 0x0c(%rdx) |
| -#define r4 0x10(%rdx) |
| -#define s1 0x00(%rsp) |
| -#define s2 0x04(%rsp) |
| -#define s3 0x08(%rsp) |
| -#define s4 0x0c(%rsp) |
| -#define m %rsi |
| -#define h01 %xmm0 |
| -#define h23 %xmm1 |
| -#define h44 %xmm2 |
| -#define t1 %xmm3 |
| -#define t2 %xmm4 |
| -#define t3 %xmm5 |
| -#define t4 %xmm6 |
| -#define mask %xmm7 |
| -#define d0 %r8 |
| -#define d1 %r9 |
| -#define d2 %r10 |
| -#define d3 %r11 |
| -#define d4 %r12 |
| - |
| -ENTRY(poly1305_block_sse2) |
| - # %rdi: Accumulator h[5] |
| - # %rsi: 16 byte input block m |
| - # %rdx: Poly1305 key r[5] |
| - # %rcx: Block count |
| - |
| - # This single block variant tries to improve performance by doing two |
| - # multiplications in parallel using SSE instructions. There is quite |
| - # some quardword packing involved, hence the speedup is marginal. |
| - |
| - push %rbx |
| - push %r12 |
| - sub $0x10,%rsp |
| - |
| - # s1..s4 = r1..r4 * 5 |
| - mov r1,%eax |
| - lea (%eax,%eax,4),%eax |
| - mov %eax,s1 |
| - mov r2,%eax |
| - lea (%eax,%eax,4),%eax |
| - mov %eax,s2 |
| - mov r3,%eax |
| - lea (%eax,%eax,4),%eax |
| - mov %eax,s3 |
| - mov r4,%eax |
| - lea (%eax,%eax,4),%eax |
| - mov %eax,s4 |
| - |
| - movdqa ANMASK(%rip),mask |
| - |
| -.Ldoblock: |
| - # h01 = [0, h1, 0, h0] |
| - # h23 = [0, h3, 0, h2] |
| - # h44 = [0, h4, 0, h4] |
| - movd h0,h01 |
| - movd h1,t1 |
| - movd h2,h23 |
| - movd h3,t2 |
| - movd h4,h44 |
| - punpcklqdq t1,h01 |
| - punpcklqdq t2,h23 |
| - punpcklqdq h44,h44 |
| - |
| - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] |
| - movd 0x00(m),t1 |
| - movd 0x03(m),t2 |
| - psrld $2,t2 |
| - punpcklqdq t2,t1 |
| - pand mask,t1 |
| - paddd t1,h01 |
| - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] |
| - movd 0x06(m),t1 |
| - movd 0x09(m),t2 |
| - psrld $4,t1 |
| - psrld $6,t2 |
| - punpcklqdq t2,t1 |
| - pand mask,t1 |
| - paddd t1,h23 |
| - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] |
| - mov 0x0c(m),%eax |
| - shr $8,%eax |
| - or $0x01000000,%eax |
| - movd %eax,t1 |
| - pshufd $0xc4,t1,t1 |
| - paddd t1,h44 |
| - |
| - # t1[0] = h0 * r0 + h2 * s3 |
| - # t1[1] = h1 * s4 + h3 * s2 |
| - movd r0,t1 |
| - movd s4,t2 |
| - punpcklqdq t2,t1 |
| - pmuludq h01,t1 |
| - movd s3,t2 |
| - movd s2,t3 |
| - punpcklqdq t3,t2 |
| - pmuludq h23,t2 |
| - paddq t2,t1 |
| - # t2[0] = h0 * r1 + h2 * s4 |
| - # t2[1] = h1 * r0 + h3 * s3 |
| - movd r1,t2 |
| - movd r0,t3 |
| - punpcklqdq t3,t2 |
| - pmuludq h01,t2 |
| - movd s4,t3 |
| - movd s3,t4 |
| - punpcklqdq t4,t3 |
| - pmuludq h23,t3 |
| - paddq t3,t2 |
| - # t3[0] = h4 * s1 |
| - # t3[1] = h4 * s2 |
| - movd s1,t3 |
| - movd s2,t4 |
| - punpcklqdq t4,t3 |
| - pmuludq h44,t3 |
| - # d0 = t1[0] + t1[1] + t3[0] |
| - # d1 = t2[0] + t2[1] + t3[1] |
| - movdqa t1,t4 |
| - punpcklqdq t2,t4 |
| - punpckhqdq t2,t1 |
| - paddq t4,t1 |
| - paddq t3,t1 |
| - movq t1,d0 |
| - psrldq $8,t1 |
| - movq t1,d1 |
| - |
| - # t1[0] = h0 * r2 + h2 * r0 |
| - # t1[1] = h1 * r1 + h3 * s4 |
| - movd r2,t1 |
| - movd r1,t2 |
| - punpcklqdq t2,t1 |
| - pmuludq h01,t1 |
| - movd r0,t2 |
| - movd s4,t3 |
| - punpcklqdq t3,t2 |
| - pmuludq h23,t2 |
| - paddq t2,t1 |
| - # t2[0] = h0 * r3 + h2 * r1 |
| - # t2[1] = h1 * r2 + h3 * r0 |
| - movd r3,t2 |
| - movd r2,t3 |
| - punpcklqdq t3,t2 |
| - pmuludq h01,t2 |
| - movd r1,t3 |
| - movd r0,t4 |
| - punpcklqdq t4,t3 |
| - pmuludq h23,t3 |
| - paddq t3,t2 |
| - # t3[0] = h4 * s3 |
| - # t3[1] = h4 * s4 |
| - movd s3,t3 |
| - movd s4,t4 |
| - punpcklqdq t4,t3 |
| - pmuludq h44,t3 |
| - # d2 = t1[0] + t1[1] + t3[0] |
| - # d3 = t2[0] + t2[1] + t3[1] |
| - movdqa t1,t4 |
| - punpcklqdq t2,t4 |
| - punpckhqdq t2,t1 |
| - paddq t4,t1 |
| - paddq t3,t1 |
| - movq t1,d2 |
| - psrldq $8,t1 |
| - movq t1,d3 |
| - |
| - # t1[0] = h0 * r4 + h2 * r2 |
| - # t1[1] = h1 * r3 + h3 * r1 |
| - movd r4,t1 |
| - movd r3,t2 |
| - punpcklqdq t2,t1 |
| - pmuludq h01,t1 |
| - movd r2,t2 |
| - movd r1,t3 |
| - punpcklqdq t3,t2 |
| - pmuludq h23,t2 |
| - paddq t2,t1 |
| - # t3[0] = h4 * r0 |
| - movd r0,t3 |
| - pmuludq h44,t3 |
| - # d4 = t1[0] + t1[1] + t3[0] |
| - movdqa t1,t4 |
| - psrldq $8,t4 |
| - paddq t4,t1 |
| - paddq t3,t1 |
| - movq t1,d4 |
| - |
| - # d1 += d0 >> 26 |
| - mov d0,%rax |
| - shr $26,%rax |
| - add %rax,d1 |
| - # h0 = d0 & 0x3ffffff |
| - mov d0,%rbx |
| - and $0x3ffffff,%ebx |
| - |
| - # d2 += d1 >> 26 |
| - mov d1,%rax |
| - shr $26,%rax |
| - add %rax,d2 |
| - # h1 = d1 & 0x3ffffff |
| - mov d1,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h1 |
| - |
| - # d3 += d2 >> 26 |
| - mov d2,%rax |
| - shr $26,%rax |
| - add %rax,d3 |
| - # h2 = d2 & 0x3ffffff |
| - mov d2,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h2 |
| - |
| - # d4 += d3 >> 26 |
| - mov d3,%rax |
| - shr $26,%rax |
| - add %rax,d4 |
| - # h3 = d3 & 0x3ffffff |
| - mov d3,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h3 |
| - |
| - # h0 += (d4 >> 26) * 5 |
| - mov d4,%rax |
| - shr $26,%rax |
| - lea (%rax,%rax,4),%rax |
| - add %rax,%rbx |
| - # h4 = d4 & 0x3ffffff |
| - mov d4,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h4 |
| - |
| - # h1 += h0 >> 26 |
| - mov %rbx,%rax |
| - shr $26,%rax |
| - add %eax,h1 |
| - # h0 = h0 & 0x3ffffff |
| - andl $0x3ffffff,%ebx |
| - mov %ebx,h0 |
| - |
| - add $0x10,m |
| - dec %rcx |
| - jnz .Ldoblock |
| - |
| - # Zeroing of key material |
| - mov %rcx,0x00(%rsp) |
| - mov %rcx,0x08(%rsp) |
| - |
| - add $0x10,%rsp |
| - pop %r12 |
| - pop %rbx |
| - ret |
| -ENDPROC(poly1305_block_sse2) |
| - |
| - |
| -#define u0 0x00(%r8) |
| -#define u1 0x04(%r8) |
| -#define u2 0x08(%r8) |
| -#define u3 0x0c(%r8) |
| -#define u4 0x10(%r8) |
| -#define hc0 %xmm0 |
| -#define hc1 %xmm1 |
| -#define hc2 %xmm2 |
| -#define hc3 %xmm5 |
| -#define hc4 %xmm6 |
| -#define ru0 %xmm7 |
| -#define ru1 %xmm8 |
| -#define ru2 %xmm9 |
| -#define ru3 %xmm10 |
| -#define ru4 %xmm11 |
| -#define sv1 %xmm12 |
| -#define sv2 %xmm13 |
| -#define sv3 %xmm14 |
| -#define sv4 %xmm15 |
| -#undef d0 |
| -#define d0 %r13 |
| - |
| -ENTRY(poly1305_2block_sse2) |
| - # %rdi: Accumulator h[5] |
| - # %rsi: 16 byte input block m |
| - # %rdx: Poly1305 key r[5] |
| - # %rcx: Doubleblock count |
| - # %r8: Poly1305 derived key r^2 u[5] |
| - |
| - # This two-block variant further improves performance by using loop |
| - # unrolled block processing. This is more straight forward and does |
| - # less byte shuffling, but requires a second Poly1305 key r^2: |
| - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r |
| - |
| - push %rbx |
| - push %r12 |
| - push %r13 |
| - |
| - # combine r0,u0 |
| - movd u0,ru0 |
| - movd r0,t1 |
| - punpcklqdq t1,ru0 |
| - |
| - # combine r1,u1 and s1=r1*5,v1=u1*5 |
| - movd u1,ru1 |
| - movd r1,t1 |
| - punpcklqdq t1,ru1 |
| - movdqa ru1,sv1 |
| - pslld $2,sv1 |
| - paddd ru1,sv1 |
| - |
| - # combine r2,u2 and s2=r2*5,v2=u2*5 |
| - movd u2,ru2 |
| - movd r2,t1 |
| - punpcklqdq t1,ru2 |
| - movdqa ru2,sv2 |
| - pslld $2,sv2 |
| - paddd ru2,sv2 |
| - |
| - # combine r3,u3 and s3=r3*5,v3=u3*5 |
| - movd u3,ru3 |
| - movd r3,t1 |
| - punpcklqdq t1,ru3 |
| - movdqa ru3,sv3 |
| - pslld $2,sv3 |
| - paddd ru3,sv3 |
| - |
| - # combine r4,u4 and s4=r4*5,v4=u4*5 |
| - movd u4,ru4 |
| - movd r4,t1 |
| - punpcklqdq t1,ru4 |
| - movdqa ru4,sv4 |
| - pslld $2,sv4 |
| - paddd ru4,sv4 |
| - |
| -.Ldoblock2: |
| - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] |
| - movd 0x00(m),hc0 |
| - movd 0x10(m),t1 |
| - punpcklqdq t1,hc0 |
| - pand ANMASK(%rip),hc0 |
| - movd h0,t1 |
| - paddd t1,hc0 |
| - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] |
| - movd 0x03(m),hc1 |
| - movd 0x13(m),t1 |
| - punpcklqdq t1,hc1 |
| - psrld $2,hc1 |
| - pand ANMASK(%rip),hc1 |
| - movd h1,t1 |
| - paddd t1,hc1 |
| - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] |
| - movd 0x06(m),hc2 |
| - movd 0x16(m),t1 |
| - punpcklqdq t1,hc2 |
| - psrld $4,hc2 |
| - pand ANMASK(%rip),hc2 |
| - movd h2,t1 |
| - paddd t1,hc2 |
| - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] |
| - movd 0x09(m),hc3 |
| - movd 0x19(m),t1 |
| - punpcklqdq t1,hc3 |
| - psrld $6,hc3 |
| - pand ANMASK(%rip),hc3 |
| - movd h3,t1 |
| - paddd t1,hc3 |
| - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] |
| - movd 0x0c(m),hc4 |
| - movd 0x1c(m),t1 |
| - punpcklqdq t1,hc4 |
| - psrld $8,hc4 |
| - por ORMASK(%rip),hc4 |
| - movd h4,t1 |
| - paddd t1,hc4 |
| - |
| - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] |
| - movdqa ru0,t1 |
| - pmuludq hc0,t1 |
| - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] |
| - movdqa sv4,t2 |
| - pmuludq hc1,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] |
| - movdqa sv3,t2 |
| - pmuludq hc2,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] |
| - movdqa sv2,t2 |
| - pmuludq hc3,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] |
| - movdqa sv1,t2 |
| - pmuludq hc4,t2 |
| - paddq t2,t1 |
| - # d0 = t1[0] + t1[1] |
| - movdqa t1,t2 |
| - psrldq $8,t2 |
| - paddq t2,t1 |
| - movq t1,d0 |
| - |
| - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] |
| - movdqa ru1,t1 |
| - pmuludq hc0,t1 |
| - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] |
| - movdqa ru0,t2 |
| - pmuludq hc1,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] |
| - movdqa sv4,t2 |
| - pmuludq hc2,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] |
| - movdqa sv3,t2 |
| - pmuludq hc3,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] |
| - movdqa sv2,t2 |
| - pmuludq hc4,t2 |
| - paddq t2,t1 |
| - # d1 = t1[0] + t1[1] |
| - movdqa t1,t2 |
| - psrldq $8,t2 |
| - paddq t2,t1 |
| - movq t1,d1 |
| - |
| - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] |
| - movdqa ru2,t1 |
| - pmuludq hc0,t1 |
| - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] |
| - movdqa ru1,t2 |
| - pmuludq hc1,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] |
| - movdqa ru0,t2 |
| - pmuludq hc2,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] |
| - movdqa sv4,t2 |
| - pmuludq hc3,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] |
| - movdqa sv3,t2 |
| - pmuludq hc4,t2 |
| - paddq t2,t1 |
| - # d2 = t1[0] + t1[1] |
| - movdqa t1,t2 |
| - psrldq $8,t2 |
| - paddq t2,t1 |
| - movq t1,d2 |
| - |
| - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] |
| - movdqa ru3,t1 |
| - pmuludq hc0,t1 |
| - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] |
| - movdqa ru2,t2 |
| - pmuludq hc1,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] |
| - movdqa ru1,t2 |
| - pmuludq hc2,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] |
| - movdqa ru0,t2 |
| - pmuludq hc3,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] |
| - movdqa sv4,t2 |
| - pmuludq hc4,t2 |
| - paddq t2,t1 |
| - # d3 = t1[0] + t1[1] |
| - movdqa t1,t2 |
| - psrldq $8,t2 |
| - paddq t2,t1 |
| - movq t1,d3 |
| - |
| - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] |
| - movdqa ru4,t1 |
| - pmuludq hc0,t1 |
| - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] |
| - movdqa ru3,t2 |
| - pmuludq hc1,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] |
| - movdqa ru2,t2 |
| - pmuludq hc2,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] |
| - movdqa ru1,t2 |
| - pmuludq hc3,t2 |
| - paddq t2,t1 |
| - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] |
| - movdqa ru0,t2 |
| - pmuludq hc4,t2 |
| - paddq t2,t1 |
| - # d4 = t1[0] + t1[1] |
| - movdqa t1,t2 |
| - psrldq $8,t2 |
| - paddq t2,t1 |
| - movq t1,d4 |
| - |
| - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| - # integers. It's true in a single-block implementation, but not here. |
| - |
| - # d1 += d0 >> 26 |
| - mov d0,%rax |
| - shr $26,%rax |
| - add %rax,d1 |
| - # h0 = d0 & 0x3ffffff |
| - mov d0,%rbx |
| - and $0x3ffffff,%ebx |
| - |
| - # d2 += d1 >> 26 |
| - mov d1,%rax |
| - shr $26,%rax |
| - add %rax,d2 |
| - # h1 = d1 & 0x3ffffff |
| - mov d1,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h1 |
| - |
| - # d3 += d2 >> 26 |
| - mov d2,%rax |
| - shr $26,%rax |
| - add %rax,d3 |
| - # h2 = d2 & 0x3ffffff |
| - mov d2,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h2 |
| - |
| - # d4 += d3 >> 26 |
| - mov d3,%rax |
| - shr $26,%rax |
| - add %rax,d4 |
| - # h3 = d3 & 0x3ffffff |
| - mov d3,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h3 |
| - |
| - # h0 += (d4 >> 26) * 5 |
| - mov d4,%rax |
| - shr $26,%rax |
| - lea (%rax,%rax,4),%rax |
| - add %rax,%rbx |
| - # h4 = d4 & 0x3ffffff |
| - mov d4,%rax |
| - and $0x3ffffff,%eax |
| - mov %eax,h4 |
| - |
| - # h1 += h0 >> 26 |
| - mov %rbx,%rax |
| - shr $26,%rax |
| - add %eax,h1 |
| - # h0 = h0 & 0x3ffffff |
| - andl $0x3ffffff,%ebx |
| - mov %ebx,h0 |
| - |
| - add $0x20,m |
| - dec %rcx |
| - jnz .Ldoblock2 |
| - |
| - pop %r13 |
| - pop %r12 |
| - pop %rbx |
| - ret |
| -ENDPROC(poly1305_2block_sse2) |
| --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| @@ -1,11 +1,14 @@ |
| -#! /usr/bin/env perl |
| -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
| +#!/usr/bin/env perl |
| +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| # |
| -# Licensed under the OpenSSL license (the "License"). You may not use |
| -# this file except in compliance with the License. You can obtain a copy |
| -# in the file LICENSE in the source distribution or at |
| -# https://www.openssl.org/source/license.html |
| - |
| +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. |
| +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. |
| +# |
| +# This code is taken from the OpenSSL project but the author, Andy Polyakov, |
| +# has relicensed it under the licenses specified in the SPDX header above. |
| +# The original headers, including the original license headers, are |
| +# included below for completeness. |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| @@ -32,7 +35,7 @@ |
| # Skylake-X system performance. Since we are likely to suppress |
| # AVX512F capability flag [at least on Skylake-X], conversion serves |
| # as kind of "investment protection". Note that next *lake processor, |
| -# Cannolake, has AVX512IFMA code path to execute... |
| +# Cannonlake, has AVX512IFMA code path to execute... |
| # |
| # Numbers are cycles per processed byte with poly1305_blocks alone, |
| # measured with rdtsc at fixed clock frequency. |
| @@ -68,39 +71,114 @@ $output = shift; |
| if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| |
| $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| +$kernel=0; $kernel=1 if (!$flavour && !$output); |
| |
| -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| -die "can't locate x86_64-xlate.pl"; |
| - |
| -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); |
| +if (!$kernel) { |
| + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| + die "can't locate x86_64-xlate.pl"; |
| + |
| + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| + *STDOUT=*OUT; |
| + |
| + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); |
| + } |
| + |
| + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); |
| + $avx += 1 if ($1==2.11 && $2>=8); |
| + } |
| + |
| + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| + `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| + $avx = ($1>=10) + ($1>=11); |
| + } |
| + |
| + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| + $avx = ($2>=3.0) + ($2>3.0); |
| + } |
| +} else { |
| + $avx = 4; # The kernel uses ifdefs for this. |
| } |
| |
| -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); |
| - $avx += 2 if ($1==2.11 && $2>=8); |
| +sub declare_function() { |
| + my ($name, $align, $nargs) = @_; |
| + if($kernel) { |
| + $code .= ".align $align\n"; |
| + $code .= "ENTRY($name)\n"; |
| + $code .= ".L$name:\n"; |
| + } else { |
| + $code .= ".globl $name\n"; |
| + $code .= ".type $name,\@function,$nargs\n"; |
| + $code .= ".align $align\n"; |
| + $code .= "$name:\n"; |
| + } |
| } |
| |
| -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| - `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| - $avx = ($1>=10) + ($1>=12); |
| +sub end_function() { |
| + my ($name) = @_; |
| + if($kernel) { |
| + $code .= "ENDPROC($name)\n"; |
| + } else { |
| + $code .= ".size $name,.-$name\n"; |
| + } |
| } |
| |
| -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| - $avx = ($2>=3.0) + ($2>3.0); |
| -} |
| +$code.=<<___ if $kernel; |
| +#include <linux/linkage.h> |
| +___ |
| |
| -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| -*STDOUT=*OUT; |
| +if ($avx) { |
| +$code.=<<___ if $kernel; |
| +.section .rodata |
| +___ |
| +$code.=<<___; |
| +.align 64 |
| +.Lconst: |
| +.Lmask24: |
| +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| +.L129: |
| +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| +.Lmask26: |
| +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| +.Lpermd_avx2: |
| +.long 2,2,2,3,2,0,2,1 |
| +.Lpermd_avx512: |
| +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| + |
| +.L2_44_inp_permd: |
| +.long 0,1,1,2,2,3,7,7 |
| +.L2_44_inp_shift: |
| +.quad 0,12,24,64 |
| +.L2_44_mask: |
| +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| +.L2_44_shift_rgt: |
| +.quad 44,44,42,64 |
| +.L2_44_shift_lft: |
| +.quad 8,8,10,64 |
| + |
| +.align 64 |
| +.Lx_mask44: |
| +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| +.Lx_mask42: |
| +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| +___ |
| +} |
| +$code.=<<___ if (!$kernel); |
| +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 16 |
| +___ |
| |
| my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); |
| my ($mac,$nonce)=($inp,$len); # *_emit arguments |
| -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); |
| -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); |
| +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); |
| +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); |
| |
| sub poly1305_iteration { |
| # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 |
| @@ -155,19 +233,19 @@ ___ |
| |
| $code.=<<___; |
| .text |
| - |
| +___ |
| +$code.=<<___ if (!$kernel); |
| .extern OPENSSL_ia32cap_P |
| |
| -.globl poly1305_init |
| -.hidden poly1305_init |
| -.globl poly1305_blocks |
| -.hidden poly1305_blocks |
| -.globl poly1305_emit |
| -.hidden poly1305_emit |
| - |
| -.type poly1305_init,\@function,3 |
| -.align 32 |
| -poly1305_init: |
| +.globl poly1305_init_x86_64 |
| +.hidden poly1305_init_x86_64 |
| +.globl poly1305_blocks_x86_64 |
| +.hidden poly1305_blocks_x86_64 |
| +.globl poly1305_emit_x86_64 |
| +.hidden poly1305_emit_x86_64 |
| +___ |
| +&declare_function("poly1305_init_x86_64", 32, 3); |
| +$code.=<<___; |
| xor %rax,%rax |
| mov %rax,0($ctx) # initialize hash value |
| mov %rax,8($ctx) |
| @@ -175,11 +253,12 @@ poly1305_init: |
| |
| cmp \$0,$inp |
| je .Lno_key |
| - |
| - lea poly1305_blocks(%rip),%r10 |
| - lea poly1305_emit(%rip),%r11 |
| ___ |
| -$code.=<<___ if ($avx); |
| +$code.=<<___ if (!$kernel); |
| + lea poly1305_blocks_x86_64(%rip),%r10 |
| + lea poly1305_emit_x86_64(%rip),%r11 |
| +___ |
| +$code.=<<___ if (!$kernel && $avx); |
| mov OPENSSL_ia32cap_P+4(%rip),%r9 |
| lea poly1305_blocks_avx(%rip),%rax |
| lea poly1305_emit_avx(%rip),%rcx |
| @@ -187,12 +266,12 @@ $code.=<<___ if ($avx); |
| cmovc %rax,%r10 |
| cmovc %rcx,%r11 |
| ___ |
| -$code.=<<___ if ($avx>1); |
| +$code.=<<___ if (!$kernel && $avx>1); |
| lea poly1305_blocks_avx2(%rip),%rax |
| bt \$`5+32`,%r9 # AVX2? |
| cmovc %rax,%r10 |
| ___ |
| -$code.=<<___ if ($avx>3); |
| +$code.=<<___ if (!$kernel && $avx>3); |
| mov \$`(1<<31|1<<21|1<<16)`,%rax |
| shr \$32,%r9 |
| and %rax,%r9 |
| @@ -207,11 +286,11 @@ $code.=<<___; |
| mov %rax,24($ctx) |
| mov %rcx,32($ctx) |
| ___ |
| -$code.=<<___ if ($flavour !~ /elf32/); |
| +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); |
| mov %r10,0(%rdx) |
| mov %r11,8(%rdx) |
| ___ |
| -$code.=<<___ if ($flavour =~ /elf32/); |
| +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); |
| mov %r10d,0(%rdx) |
| mov %r11d,4(%rdx) |
| ___ |
| @@ -219,11 +298,11 @@ $code.=<<___; |
| mov \$1,%eax |
| .Lno_key: |
| ret |
| -.size poly1305_init,.-poly1305_init |
| +___ |
| +&end_function("poly1305_init_x86_64"); |
| |
| -.type poly1305_blocks,\@function,4 |
| -.align 32 |
| -poly1305_blocks: |
| +&declare_function("poly1305_blocks_x86_64", 32, 4); |
| +$code.=<<___; |
| .cfi_startproc |
| .Lblocks: |
| shr \$4,$len |
| @@ -231,8 +310,6 @@ poly1305_blocks: |
| |
| push %rbx |
| .cfi_push %rbx |
| - push %rbp |
| -.cfi_push %rbp |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| @@ -241,6 +318,8 @@ poly1305_blocks: |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| + push $ctx |
| +.cfi_push $ctx |
| .Lblocks_body: |
| |
| mov $len,%r15 # reassign $len |
| @@ -265,26 +344,29 @@ poly1305_blocks: |
| lea 16($inp),$inp |
| adc $padbit,$h2 |
| ___ |
| + |
| &poly1305_iteration(); |
| + |
| $code.=<<___; |
| mov $r1,%rax |
| dec %r15 # len-=16 |
| jnz .Loop |
| |
| + mov 0(%rsp),$ctx |
| +.cfi_restore $ctx |
| + |
| mov $h0,0($ctx) # store hash value |
| mov $h1,8($ctx) |
| mov $h2,16($ctx) |
| |
| - mov 0(%rsp),%r15 |
| + mov 8(%rsp),%r15 |
| .cfi_restore %r15 |
| - mov 8(%rsp),%r14 |
| + mov 16(%rsp),%r14 |
| .cfi_restore %r14 |
| - mov 16(%rsp),%r13 |
| + mov 24(%rsp),%r13 |
| .cfi_restore %r13 |
| - mov 24(%rsp),%r12 |
| + mov 32(%rsp),%r12 |
| .cfi_restore %r12 |
| - mov 32(%rsp),%rbp |
| -.cfi_restore %rbp |
| mov 40(%rsp),%rbx |
| .cfi_restore %rbx |
| lea 48(%rsp),%rsp |
| @@ -293,11 +375,11 @@ $code.=<<___; |
| .Lblocks_epilogue: |
| ret |
| .cfi_endproc |
| -.size poly1305_blocks,.-poly1305_blocks |
| +___ |
| +&end_function("poly1305_blocks_x86_64"); |
| |
| -.type poly1305_emit,\@function,3 |
| -.align 32 |
| -poly1305_emit: |
| +&declare_function("poly1305_emit_x86_64", 32, 3); |
| +$code.=<<___; |
| .Lemit: |
| mov 0($ctx),%r8 # load hash value |
| mov 8($ctx),%r9 |
| @@ -318,10 +400,14 @@ poly1305_emit: |
| mov %rcx,8($mac) |
| |
| ret |
| -.size poly1305_emit,.-poly1305_emit |
| ___ |
| +&end_function("poly1305_emit_x86_64"); |
| if ($avx) { |
| |
| +if($kernel) { |
| + $code .= "#ifdef CONFIG_AS_AVX\n"; |
| +} |
| + |
| ######################################################################## |
| # Layout of opaque area is following. |
| # |
| @@ -342,15 +428,19 @@ $code.=<<___; |
| .type __poly1305_block,\@abi-omnipotent |
| .align 32 |
| __poly1305_block: |
| + push $ctx |
| ___ |
| &poly1305_iteration(); |
| $code.=<<___; |
| + pop $ctx |
| ret |
| .size __poly1305_block,.-__poly1305_block |
| |
| .type __poly1305_init_avx,\@abi-omnipotent |
| .align 32 |
| __poly1305_init_avx: |
| + push %rbp |
| + mov %rsp,%rbp |
| mov $r0,$h0 |
| mov $r1,$h1 |
| xor $h2,$h2 |
| @@ -507,12 +597,13 @@ __poly1305_init_avx: |
| mov $d1#d,`16*8+8-64`($ctx) |
| |
| lea -48-64($ctx),$ctx # size [de-]optimization |
| + pop %rbp |
| ret |
| .size __poly1305_init_avx,.-__poly1305_init_avx |
| +___ |
| |
| -.type poly1305_blocks_avx,\@function,4 |
| -.align 32 |
| -poly1305_blocks_avx: |
| +&declare_function("poly1305_blocks_avx", 32, 4); |
| +$code.=<<___; |
| .cfi_startproc |
| mov 20($ctx),%r8d # is_base2_26 |
| cmp \$128,$len |
| @@ -532,10 +623,11 @@ poly1305_blocks_avx: |
| test \$31,$len |
| jz .Leven_avx |
| |
| - push %rbx |
| -.cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| + mov %rsp,%rbp |
| + push %rbx |
| +.cfi_push %rbx |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| @@ -645,20 +737,18 @@ poly1305_blocks_avx: |
| mov $h2#d,16($ctx) |
| .align 16 |
| .Ldone_avx: |
| - mov 0(%rsp),%r15 |
| + pop %r15 |
| .cfi_restore %r15 |
| - mov 8(%rsp),%r14 |
| + pop %r14 |
| .cfi_restore %r14 |
| - mov 16(%rsp),%r13 |
| + pop %r13 |
| .cfi_restore %r13 |
| - mov 24(%rsp),%r12 |
| + pop %r12 |
| .cfi_restore %r12 |
| - mov 32(%rsp),%rbp |
| -.cfi_restore %rbp |
| - mov 40(%rsp),%rbx |
| + pop %rbx |
| .cfi_restore %rbx |
| - lea 48(%rsp),%rsp |
| -.cfi_adjust_cfa_offset -48 |
| + pop %rbp |
| +.cfi_restore %rbp |
| .Lno_data_avx: |
| .Lblocks_avx_epilogue: |
| ret |
| @@ -667,10 +757,11 @@ poly1305_blocks_avx: |
| .align 32 |
| .Lbase2_64_avx: |
| .cfi_startproc |
| - push %rbx |
| -.cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| + mov %rsp,%rbp |
| + push %rbx |
| +.cfi_push %rbx |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| @@ -736,22 +827,18 @@ poly1305_blocks_avx: |
| |
| .Lproceed_avx: |
| mov %r15,$len |
| - |
| - mov 0(%rsp),%r15 |
| + pop %r15 |
| .cfi_restore %r15 |
| - mov 8(%rsp),%r14 |
| + pop %r14 |
| .cfi_restore %r14 |
| - mov 16(%rsp),%r13 |
| + pop %r13 |
| .cfi_restore %r13 |
| - mov 24(%rsp),%r12 |
| + pop %r12 |
| .cfi_restore %r12 |
| - mov 32(%rsp),%rbp |
| -.cfi_restore %rbp |
| - mov 40(%rsp),%rbx |
| + pop %rbx |
| .cfi_restore %rbx |
| - lea 48(%rsp),%rax |
| - lea 48(%rsp),%rsp |
| -.cfi_adjust_cfa_offset -48 |
| + pop %rbp |
| +.cfi_restore %rbp |
| .Lbase2_64_avx_epilogue: |
| jmp .Ldo_avx |
| .cfi_endproc |
| @@ -768,8 +855,11 @@ poly1305_blocks_avx: |
| .Ldo_avx: |
| ___ |
| $code.=<<___ if (!$win64); |
| + lea 8(%rsp),%r10 |
| +.cfi_def_cfa_register %r10 |
| + and \$-32,%rsp |
| + sub \$-8,%rsp |
| lea -0x58(%rsp),%r11 |
| -.cfi_def_cfa %r11,0x60 |
| sub \$0x178,%rsp |
| ___ |
| $code.=<<___ if ($win64); |
| @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); |
| .Ldo_avx_epilogue: |
| ___ |
| $code.=<<___ if (!$win64); |
| - lea 0x58(%r11),%rsp |
| -.cfi_def_cfa %rsp,8 |
| + lea -8(%r10),%rsp |
| +.cfi_def_cfa_register %rsp |
| ___ |
| $code.=<<___; |
| vzeroupper |
| ret |
| .cfi_endproc |
| -.size poly1305_blocks_avx,.-poly1305_blocks_avx |
| +___ |
| +&end_function("poly1305_blocks_avx"); |
| |
| -.type poly1305_emit_avx,\@function,3 |
| -.align 32 |
| -poly1305_emit_avx: |
| +&declare_function("poly1305_emit_avx", 32, 3); |
| +$code.=<<___; |
| cmpl \$0,20($ctx) # is_base2_26? |
| je .Lemit |
| |
| @@ -1423,41 +1513,51 @@ poly1305_emit_avx: |
| mov %rcx,8($mac) |
| |
| ret |
| -.size poly1305_emit_avx,.-poly1305_emit_avx |
| ___ |
| +&end_function("poly1305_emit_avx"); |
| + |
| +if ($kernel) { |
| + $code .= "#endif\n"; |
| +} |
| |
| if ($avx>1) { |
| + |
| +if ($kernel) { |
| + $code .= "#ifdef CONFIG_AS_AVX2\n"; |
| +} |
| + |
| my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = |
| map("%ymm$_",(0..15)); |
| my $S4=$MASK; |
| |
| +sub poly1305_blocks_avxN { |
| + my ($avx512) = @_; |
| + my $suffix = $avx512 ? "_avx512" : ""; |
| $code.=<<___; |
| -.type poly1305_blocks_avx2,\@function,4 |
| -.align 32 |
| -poly1305_blocks_avx2: |
| .cfi_startproc |
| mov 20($ctx),%r8d # is_base2_26 |
| cmp \$128,$len |
| - jae .Lblocks_avx2 |
| + jae .Lblocks_avx2$suffix |
| test %r8d,%r8d |
| jz .Lblocks |
| |
| -.Lblocks_avx2: |
| +.Lblocks_avx2$suffix: |
| and \$-16,$len |
| - jz .Lno_data_avx2 |
| + jz .Lno_data_avx2$suffix |
| |
| vzeroupper |
| |
| test %r8d,%r8d |
| - jz .Lbase2_64_avx2 |
| + jz .Lbase2_64_avx2$suffix |
| |
| test \$63,$len |
| - jz .Leven_avx2 |
| + jz .Leven_avx2$suffix |
| |
| - push %rbx |
| -.cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| + mov %rsp,%rbp |
| + push %rbx |
| +.cfi_push %rbx |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| -.Lblocks_avx2_body: |
| +.Lblocks_avx2_body$suffix: |
| |
| mov $len,%r15 # reassign $len |
| |
| @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: |
| shr \$2,$s1 |
| add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| |
| -.Lbase2_26_pre_avx2: |
| +.Lbase2_26_pre_avx2$suffix: |
| add 0($inp),$h0 # accumulate input |
| adc 8($inp),$h1 |
| lea 16($inp),$inp |
| @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: |
| mov $r1,%rax |
| |
| test \$63,%r15 |
| - jnz .Lbase2_26_pre_avx2 |
| + jnz .Lbase2_26_pre_avx2$suffix |
| |
| test $padbit,$padbit # if $padbit is zero, |
| - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format |
| + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format |
| |
| ################################# base 2^64 -> base 2^26 |
| mov $h0,%rax |
| @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: |
| or $r1,$h2 # h[4] |
| |
| test %r15,%r15 |
| - jz .Lstore_base2_26_avx2 |
| + jz .Lstore_base2_26_avx2$suffix |
| |
| vmovd %rax#d,%x#$H0 |
| vmovd %rdx#d,%x#$H1 |
| vmovd $h0#d,%x#$H2 |
| vmovd $h1#d,%x#$H3 |
| vmovd $h2#d,%x#$H4 |
| - jmp .Lproceed_avx2 |
| + jmp .Lproceed_avx2$suffix |
| |
| .align 32 |
| -.Lstore_base2_64_avx2: |
| +.Lstore_base2_64_avx2$suffix: |
| mov $h0,0($ctx) |
| mov $h1,8($ctx) |
| mov $h2,16($ctx) # note that is_base2_26 is zeroed |
| - jmp .Ldone_avx2 |
| + jmp .Ldone_avx2$suffix |
| |
| .align 16 |
| -.Lstore_base2_26_avx2: |
| +.Lstore_base2_26_avx2$suffix: |
| mov %rax#d,0($ctx) # store hash value base 2^26 |
| mov %rdx#d,4($ctx) |
| mov $h0#d,8($ctx) |
| mov $h1#d,12($ctx) |
| mov $h2#d,16($ctx) |
| .align 16 |
| -.Ldone_avx2: |
| - mov 0(%rsp),%r15 |
| +.Ldone_avx2$suffix: |
| + pop %r15 |
| .cfi_restore %r15 |
| - mov 8(%rsp),%r14 |
| + pop %r14 |
| .cfi_restore %r14 |
| - mov 16(%rsp),%r13 |
| + pop %r13 |
| .cfi_restore %r13 |
| - mov 24(%rsp),%r12 |
| + pop %r12 |
| .cfi_restore %r12 |
| - mov 32(%rsp),%rbp |
| -.cfi_restore %rbp |
| - mov 40(%rsp),%rbx |
| + pop %rbx |
| .cfi_restore %rbx |
| - lea 48(%rsp),%rsp |
| -.cfi_adjust_cfa_offset -48 |
| -.Lno_data_avx2: |
| -.Lblocks_avx2_epilogue: |
| + pop %rbp |
| +.cfi_restore %rbp |
| +.Lno_data_avx2$suffix: |
| +.Lblocks_avx2_epilogue$suffix: |
| ret |
| .cfi_endproc |
| |
| .align 32 |
| -.Lbase2_64_avx2: |
| +.Lbase2_64_avx2$suffix: |
| .cfi_startproc |
| - push %rbx |
| -.cfi_push %rbx |
| push %rbp |
| .cfi_push %rbp |
| + mov %rsp,%rbp |
| + push %rbx |
| +.cfi_push %rbx |
| push %r12 |
| .cfi_push %r12 |
| push %r13 |
| @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: |
| .cfi_push %r14 |
| push %r15 |
| .cfi_push %r15 |
| -.Lbase2_64_avx2_body: |
| +.Lbase2_64_avx2_body$suffix: |
| |
| mov $len,%r15 # reassign $len |
| |
| @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: |
| add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| |
| test \$63,$len |
| - jz .Linit_avx2 |
| + jz .Linit_avx2$suffix |
| |
| -.Lbase2_64_pre_avx2: |
| +.Lbase2_64_pre_avx2$suffix: |
| add 0($inp),$h0 # accumulate input |
| adc 8($inp),$h1 |
| lea 16($inp),$inp |
| @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: |
| mov $r1,%rax |
| |
| test \$63,%r15 |
| - jnz .Lbase2_64_pre_avx2 |
| + jnz .Lbase2_64_pre_avx2$suffix |
| |
| -.Linit_avx2: |
| +.Linit_avx2$suffix: |
| ################################# base 2^64 -> base 2^26 |
| mov $h0,%rax |
| mov $h0,%rdx |
| @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: |
| |
| call __poly1305_init_avx |
| |
| -.Lproceed_avx2: |
| +.Lproceed_avx2$suffix: |
| mov %r15,$len # restore $len |
| - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| +___ |
| +$code.=<<___ if (!$kernel); |
| + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| mov \$`(1<<31|1<<30|1<<16)`,%r11d |
| - |
| - mov 0(%rsp),%r15 |
| +___ |
| +$code.=<<___; |
| + pop %r15 |
| .cfi_restore %r15 |
| - mov 8(%rsp),%r14 |
| + pop %r14 |
| .cfi_restore %r14 |
| - mov 16(%rsp),%r13 |
| + pop %r13 |
| .cfi_restore %r13 |
| - mov 24(%rsp),%r12 |
| + pop %r12 |
| .cfi_restore %r12 |
| - mov 32(%rsp),%rbp |
| -.cfi_restore %rbp |
| - mov 40(%rsp),%rbx |
| + pop %rbx |
| .cfi_restore %rbx |
| - lea 48(%rsp),%rax |
| - lea 48(%rsp),%rsp |
| -.cfi_adjust_cfa_offset -48 |
| -.Lbase2_64_avx2_epilogue: |
| - jmp .Ldo_avx2 |
| + pop %rbp |
| +.cfi_restore %rbp |
| +.Lbase2_64_avx2_epilogue$suffix: |
| + jmp .Ldo_avx2$suffix |
| .cfi_endproc |
| |
| .align 32 |
| -.Leven_avx2: |
| +.Leven_avx2$suffix: |
| .cfi_startproc |
| - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| +___ |
| +$code.=<<___ if (!$kernel); |
| + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| +___ |
| +$code.=<<___; |
| vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 |
| vmovd 4*1($ctx),%x#$H1 |
| vmovd 4*2($ctx),%x#$H2 |
| vmovd 4*3($ctx),%x#$H3 |
| vmovd 4*4($ctx),%x#$H4 |
| |
| -.Ldo_avx2: |
| +.Ldo_avx2$suffix: |
| ___ |
| -$code.=<<___ if ($avx>2); |
| +$code.=<<___ if (!$kernel && $avx>2); |
| cmp \$512,$len |
| jb .Lskip_avx512 |
| - and %r11d,%r10d |
| - test \$`1<<16`,%r10d # check for AVX512F |
| + and %r11d,%r9d |
| + test \$`1<<16`,%r9d # check for AVX512F |
| jnz .Lblocks_avx512 |
| -.Lskip_avx512: |
| +.Lskip_avx512$suffix: |
| +___ |
| +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); |
| + cmp \$512,$len |
| + jae .Lblocks_avx512 |
| ___ |
| $code.=<<___ if (!$win64); |
| - lea -8(%rsp),%r11 |
| -.cfi_def_cfa %r11,16 |
| + lea 8(%rsp),%r10 |
| +.cfi_def_cfa_register %r10 |
| sub \$0x128,%rsp |
| ___ |
| $code.=<<___ if ($win64); |
| - lea -0xf8(%rsp),%r11 |
| + lea 8(%rsp),%r10 |
| sub \$0x1c8,%rsp |
| - vmovdqa %xmm6,0x50(%r11) |
| - vmovdqa %xmm7,0x60(%r11) |
| - vmovdqa %xmm8,0x70(%r11) |
| - vmovdqa %xmm9,0x80(%r11) |
| - vmovdqa %xmm10,0x90(%r11) |
| - vmovdqa %xmm11,0xa0(%r11) |
| - vmovdqa %xmm12,0xb0(%r11) |
| - vmovdqa %xmm13,0xc0(%r11) |
| - vmovdqa %xmm14,0xd0(%r11) |
| - vmovdqa %xmm15,0xe0(%r11) |
| -.Ldo_avx2_body: |
| + vmovdqa %xmm6,-0xb0(%r10) |
| + vmovdqa %xmm7,-0xa0(%r10) |
| + vmovdqa %xmm8,-0x90(%r10) |
| + vmovdqa %xmm9,-0x80(%r10) |
| + vmovdqa %xmm10,-0x70(%r10) |
| + vmovdqa %xmm11,-0x60(%r10) |
| + vmovdqa %xmm12,-0x50(%r10) |
| + vmovdqa %xmm13,-0x40(%r10) |
| + vmovdqa %xmm14,-0x30(%r10) |
| + vmovdqa %xmm15,-0x20(%r10) |
| +.Ldo_avx2_body$suffix: |
| ___ |
| $code.=<<___; |
| lea .Lconst(%rip),%rcx |
| @@ -1794,11 +1901,11 @@ $code.=<<___; |
| |
| vpaddq $H2,$T2,$H2 # accumulate input |
| sub \$64,$len |
| - jz .Ltail_avx2 |
| - jmp .Loop_avx2 |
| + jz .Ltail_avx2$suffix |
| + jmp .Loop_avx2$suffix |
| |
| .align 32 |
| -.Loop_avx2: |
| +.Loop_avx2$suffix: |
| ################################################################ |
| # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 |
| # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 |
| @@ -1946,10 +2053,10 @@ $code.=<<___; |
| vpor 32(%rcx),$T4,$T4 # padbit, yes, always |
| |
| sub \$64,$len |
| - jnz .Loop_avx2 |
| + jnz .Loop_avx2$suffix |
| |
| .byte 0x66,0x90 |
| -.Ltail_avx2: |
| +.Ltail_avx2$suffix: |
| ################################################################ |
| # while above multiplications were by r^4 in all lanes, in last |
| # iteration we multiply least significant lane by r^4 and most |
| @@ -2087,37 +2194,29 @@ $code.=<<___; |
| vmovd %x#$H4,`4*4-48-64`($ctx) |
| ___ |
| $code.=<<___ if ($win64); |
| - vmovdqa 0x50(%r11),%xmm6 |
| - vmovdqa 0x60(%r11),%xmm7 |
| - vmovdqa 0x70(%r11),%xmm8 |
| - vmovdqa 0x80(%r11),%xmm9 |
| - vmovdqa 0x90(%r11),%xmm10 |
| - vmovdqa 0xa0(%r11),%xmm11 |
| - vmovdqa 0xb0(%r11),%xmm12 |
| - vmovdqa 0xc0(%r11),%xmm13 |
| - vmovdqa 0xd0(%r11),%xmm14 |
| - vmovdqa 0xe0(%r11),%xmm15 |
| - lea 0xf8(%r11),%rsp |
| -.Ldo_avx2_epilogue: |
| + vmovdqa -0xb0(%r10),%xmm6 |
| + vmovdqa -0xa0(%r10),%xmm7 |
| + vmovdqa -0x90(%r10),%xmm8 |
| + vmovdqa -0x80(%r10),%xmm9 |
| + vmovdqa -0x70(%r10),%xmm10 |
| + vmovdqa -0x60(%r10),%xmm11 |
| + vmovdqa -0x50(%r10),%xmm12 |
| + vmovdqa -0x40(%r10),%xmm13 |
| + vmovdqa -0x30(%r10),%xmm14 |
| + vmovdqa -0x20(%r10),%xmm15 |
| + lea -8(%r10),%rsp |
| +.Ldo_avx2_epilogue$suffix: |
| ___ |
| $code.=<<___ if (!$win64); |
| - lea 8(%r11),%rsp |
| -.cfi_def_cfa %rsp,8 |
| + lea -8(%r10),%rsp |
| +.cfi_def_cfa_register %rsp |
| ___ |
| $code.=<<___; |
| vzeroupper |
| ret |
| .cfi_endproc |
| -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 |
| ___ |
| -####################################################################### |
| -if ($avx>2) { |
| -# On entry we have input length divisible by 64. But since inner loop |
| -# processes 128 bytes per iteration, cases when length is not divisible |
| -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| -# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| -# for this tail, we wouldn't have to even allocate stack frame... |
| - |
| +if($avx > 2 && $avx512) { |
| my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); |
| my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); |
| my $PADBIT="%zmm30"; |
| @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); |
| map(s/%y/%z/,($MASK)); |
| |
| $code.=<<___; |
| -.type poly1305_blocks_avx512,\@function,4 |
| -.align 32 |
| -poly1305_blocks_avx512: |
| .cfi_startproc |
| .Lblocks_avx512: |
| mov \$15,%eax |
| kmovw %eax,%k2 |
| ___ |
| $code.=<<___ if (!$win64); |
| - lea -8(%rsp),%r11 |
| -.cfi_def_cfa %r11,16 |
| + lea 8(%rsp),%r10 |
| +.cfi_def_cfa_register %r10 |
| sub \$0x128,%rsp |
| ___ |
| $code.=<<___ if ($win64); |
| - lea -0xf8(%rsp),%r11 |
| + lea 8(%rsp),%r10 |
| sub \$0x1c8,%rsp |
| - vmovdqa %xmm6,0x50(%r11) |
| - vmovdqa %xmm7,0x60(%r11) |
| - vmovdqa %xmm8,0x70(%r11) |
| - vmovdqa %xmm9,0x80(%r11) |
| - vmovdqa %xmm10,0x90(%r11) |
| - vmovdqa %xmm11,0xa0(%r11) |
| - vmovdqa %xmm12,0xb0(%r11) |
| - vmovdqa %xmm13,0xc0(%r11) |
| - vmovdqa %xmm14,0xd0(%r11) |
| - vmovdqa %xmm15,0xe0(%r11) |
| + vmovdqa %xmm6,-0xb0(%r10) |
| + vmovdqa %xmm7,-0xa0(%r10) |
| + vmovdqa %xmm8,-0x90(%r10) |
| + vmovdqa %xmm9,-0x80(%r10) |
| + vmovdqa %xmm10,-0x70(%r10) |
| + vmovdqa %xmm11,-0x60(%r10) |
| + vmovdqa %xmm12,-0x50(%r10) |
| + vmovdqa %xmm13,-0x40(%r10) |
| + vmovdqa %xmm14,-0x30(%r10) |
| + vmovdqa %xmm15,-0x20(%r10) |
| .Ldo_avx512_body: |
| ___ |
| $code.=<<___; |
| @@ -2679,7 +2775,7 @@ $code.=<<___; |
| |
| lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 |
| add \$64,$len |
| - jnz .Ltail_avx2 |
| + jnz .Ltail_avx2$suffix |
| |
| vpsubq $T2,$H2,$H2 # undo input accumulation |
| vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced |
| @@ -2690,29 +2786,61 @@ $code.=<<___; |
| vzeroall |
| ___ |
| $code.=<<___ if ($win64); |
| - movdqa 0x50(%r11),%xmm6 |
| - movdqa 0x60(%r11),%xmm7 |
| - movdqa 0x70(%r11),%xmm8 |
| - movdqa 0x80(%r11),%xmm9 |
| - movdqa 0x90(%r11),%xmm10 |
| - movdqa 0xa0(%r11),%xmm11 |
| - movdqa 0xb0(%r11),%xmm12 |
| - movdqa 0xc0(%r11),%xmm13 |
| - movdqa 0xd0(%r11),%xmm14 |
| - movdqa 0xe0(%r11),%xmm15 |
| - lea 0xf8(%r11),%rsp |
| + movdqa -0xb0(%r10),%xmm6 |
| + movdqa -0xa0(%r10),%xmm7 |
| + movdqa -0x90(%r10),%xmm8 |
| + movdqa -0x80(%r10),%xmm9 |
| + movdqa -0x70(%r10),%xmm10 |
| + movdqa -0x60(%r10),%xmm11 |
| + movdqa -0x50(%r10),%xmm12 |
| + movdqa -0x40(%r10),%xmm13 |
| + movdqa -0x30(%r10),%xmm14 |
| + movdqa -0x20(%r10),%xmm15 |
| + lea -8(%r10),%rsp |
| .Ldo_avx512_epilogue: |
| ___ |
| $code.=<<___ if (!$win64); |
| - lea 8(%r11),%rsp |
| -.cfi_def_cfa %rsp,8 |
| + lea -8(%r10),%rsp |
| +.cfi_def_cfa_register %rsp |
| ___ |
| $code.=<<___; |
| ret |
| .cfi_endproc |
| -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 |
| ___ |
| -if ($avx>3) { |
| + |
| +} |
| + |
| +} |
| + |
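| +# Both entry points below are stamped out from one shared body, with |
| +# labels parameterized by $suffix so the two copies can coexist in one |
| +# object file. |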
| +&declare_function("poly1305_blocks_avx2", 32, 4); |
| +poly1305_blocks_avxN(0); |
| +&end_function("poly1305_blocks_avx2"); |
| + |
| +if($kernel) { |
| + $code .= "#endif\n"; |
| +} |
| + |
| +####################################################################### |
| +if ($avx>2) { |
| +# On entry we have input length divisible by 64. But since inner loop |
| +# processes 128 bytes per iteration, cases when length is not divisible |
| +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| +# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| +# for this tail, we wouldn't have to even allocate stack frame... |
| + |
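| +# Older assemblers may lack AVX-512 support, so in kernel builds the |
| +# AVX-512 body is gated on CONFIG_AS_AVX512. |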
| +if($kernel) { |
| + $code .= "#ifdef CONFIG_AS_AVX512\n"; |
| +} |
| + |
| +&declare_function("poly1305_blocks_avx512", 32, 4); |
| +poly1305_blocks_avxN(1); |
| +&end_function("poly1305_blocks_avx512"); |
| + |
| +if ($kernel) { |
| + $code .= "#endif\n"; |
| +} |
| + |
| +if (!$kernel && $avx>3) { |
| ######################################################################## |
| # VPMADD52 version using 2^44 radix. |
| # |
| @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: |
| .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 |
| ___ |
| } } } |
| -$code.=<<___; |
| -.align 64 |
| -.Lconst: |
| -.Lmask24: |
| -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| -.L129: |
| -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| -.Lmask26: |
| -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| -.Lpermd_avx2: |
| -.long 2,2,2,3,2,0,2,1 |
| -.Lpermd_avx512: |
| -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| - |
| -.L2_44_inp_permd: |
| -.long 0,1,1,2,2,3,7,7 |
| -.L2_44_inp_shift: |
| -.quad 0,12,24,64 |
| -.L2_44_mask: |
| -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| -.L2_44_shift_rgt: |
| -.quad 44,44,42,64 |
| -.L2_44_shift_lft: |
| -.quad 8,8,10,64 |
| - |
| -.align 64 |
| -.Lx_mask44: |
| -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| -.Lx_mask42: |
| -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| -___ |
| } |
| -$code.=<<___; |
| -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| -.align 16 |
| -___ |
| |
| +if (!$kernel) |
| { # chacha20-poly1305 helpers |
| my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
| ("%rdi","%rsi","%rdx","%rcx"); # Unix order |
| @@ -4038,17 +4130,17 @@ avx_handler: |
| |
| .section .pdata |
| .align 4 |
| - .rva .LSEH_begin_poly1305_init |
| - .rva .LSEH_end_poly1305_init |
| - .rva .LSEH_info_poly1305_init |
| - |
| - .rva .LSEH_begin_poly1305_blocks |
| - .rva .LSEH_end_poly1305_blocks |
| - .rva .LSEH_info_poly1305_blocks |
| - |
| - .rva .LSEH_begin_poly1305_emit |
| - .rva .LSEH_end_poly1305_emit |
| - .rva .LSEH_info_poly1305_emit |
| + .rva .LSEH_begin_poly1305_init_x86_64 |
| + .rva .LSEH_end_poly1305_init_x86_64 |
| + .rva .LSEH_info_poly1305_init_x86_64 |
| + |
| + .rva .LSEH_begin_poly1305_blocks_x86_64 |
| + .rva .LSEH_end_poly1305_blocks_x86_64 |
| + .rva .LSEH_info_poly1305_blocks_x86_64 |
| + |
| + .rva .LSEH_begin_poly1305_emit_x86_64 |
| + .rva .LSEH_end_poly1305_emit_x86_64 |
| + .rva .LSEH_info_poly1305_emit_x86_64 |
| ___ |
| $code.=<<___ if ($avx); |
| .rva .LSEH_begin_poly1305_blocks_avx |
| @@ -4088,20 +4180,20 @@ ___ |
| $code.=<<___; |
| .section .xdata |
| .align 8 |
| -.LSEH_info_poly1305_init: |
| +.LSEH_info_poly1305_init_x86_64: |
| .byte 9,0,0,0 |
| .rva se_handler |
| - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init |
| + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 |
| |
| -.LSEH_info_poly1305_blocks: |
| +.LSEH_info_poly1305_blocks_x86_64: |
| .byte 9,0,0,0 |
| .rva se_handler |
| .rva .Lblocks_body,.Lblocks_epilogue |
| |
| -.LSEH_info_poly1305_emit: |
| +.LSEH_info_poly1305_emit_x86_64: |
| .byte 9,0,0,0 |
| .rva se_handler |
| - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit |
| + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 |
| ___ |
| $code.=<<___ if ($avx); |
| .LSEH_info_poly1305_blocks_avx_1: |
| @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); |
| ___ |
| } |
| |
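| +# Replay this script's own leading comment block into the generated file, |
| +# rewriting '#' to '//' so the header survives as C-style comments. |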
| +open SELF,$0; |
| +while(<SELF>) { |
| + next if (/^#!/); |
| + last if (!s/^#/\/\// and !/^$/); |
| + print; |
| +} |
| +close SELF; |
| + |
| foreach (split('\n',$code)) { |
| s/\`([^\`]*)\`/eval($1)/ge; |
| s/%r([a-z]+)#d/%e$1/g; |
| s/%r([0-9]+)#d/%r$1d/g; |
| s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; |
| |
| + if ($kernel) { |
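| +		# Kernel builds: drop the argument count from .type directives, |
| +		# rewrite @abi-omnipotent as @function, and strip CFI annotations. |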
| + s/(^\.type.*),[0-9]+$/\1/; |
| + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; |
| + next if /^\.cfi.*/; |
| + } |
| + |
| print $_,"\n"; |
| } |
| close STDOUT; |
| --- a/arch/x86/crypto/poly1305_glue.c |
| +++ b/arch/x86/crypto/poly1305_glue.c |
| @@ -1,8 +1,6 @@ |
| -// SPDX-License-Identifier: GPL-2.0-or-later |
| +// SPDX-License-Identifier: GPL-2.0 OR MIT |
| /* |
| - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code |
| - * |
| - * Copyright (C) 2015 Martin Willi |
| + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| */ |
| |
| #include <crypto/algapi.h> |
| @@ -13,279 +11,170 @@ |
| #include <linux/jump_label.h> |
| #include <linux/kernel.h> |
| #include <linux/module.h> |
| +#include <asm/intel-family.h> |
| #include <asm/simd.h> |
| |
| -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, |
| - const u32 *r, unsigned int blocks); |
| -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, |
| - unsigned int blocks, const u32 *u); |
| -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, |
| - unsigned int blocks, const u32 *u); |
| +asmlinkage void poly1305_init_x86_64(void *ctx, |
| + const u8 key[POLY1305_KEY_SIZE]); |
| +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, |
| + const size_t len, const u32 padbit); |
| +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| + const u32 nonce[4]); |
| +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| + const u32 nonce[4]); |
| +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, |
| + const u32 padbit); |
| +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, |
| + const u32 padbit); |
| +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, |
| + const size_t len, const u32 padbit); |
| |
| -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); |
| +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); |
| static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); |
| +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); |
| |
| -static inline u64 mlt(u64 a, u64 b) |
| -{ |
| - return a * b; |
| -} |
| - |
| -static inline u32 sr(u64 v, u_char n) |
| -{ |
| - return v >> n; |
| -} |
| - |
| -static inline u32 and(u32 v, u32 mask) |
| -{ |
| - return v & mask; |
| -} |
| - |
| -static void poly1305_simd_mult(u32 *a, const u32 *b) |
| -{ |
| - u8 m[POLY1305_BLOCK_SIZE]; |
| - |
| - memset(m, 0, sizeof(m)); |
| - /* The poly1305 block function adds a hi-bit to the accumulator which |
| - * we don't need for key multiplication; compensate for it. */ |
| - a[4] -= 1 << 24; |
| - poly1305_block_sse2(a, m, b, 1); |
| -} |
| - |
| -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) |
| -{ |
| - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ |
| - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; |
| - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; |
| - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; |
| - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; |
| - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; |
| -} |
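| +/* Mirrors the context layout the assembly expects: the accumulator lives |
| + * either as five base 2^26 limbs (h, with is_base2_26 set) or as three |
| + * base 2^64 words (hs); r is the clamped key, and rn caches the powers of |
| + * r in the interleaved order the vector code consumes them. */ |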
| +struct poly1305_arch_internal { |
| + union { |
| + struct { |
| + u32 h[5]; |
| + u32 is_base2_26; |
| + }; |
| + u64 hs[3]; |
| + }; |
| + u64 r[2]; |
| + u64 pad; |
| + struct { u32 r2, r1, r4, r3; } rn[9]; |
| +}; |
| |
| -static void poly1305_integer_blocks(struct poly1305_state *state, |
| - const struct poly1305_key *key, |
| - const void *src, |
| - unsigned int nblocks, u32 hibit) |
| +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit |
| + * the unfortunate situation of using AVX and then having to go back to scalar |
| + * -- because the user is silly and has called the update function from two |
| + * separate contexts -- then we need to convert back to the original base before |
| + * proceeding. It is possible to reason that the initial reduction below is |
| + * sufficient given the implementation invariants. However, for the avoidance |
| + * of doubt and because this is not performance critical, we do the full |
| + * reduction anyway. Z3 proof of the function below: https://xn--4db.cc/ltPtHCKN/py |
| + */ |
| +static void convert_to_base2_64(void *ctx) |
| { |
| - u32 r0, r1, r2, r3, r4; |
| - u32 s1, s2, s3, s4; |
| - u32 h0, h1, h2, h3, h4; |
| - u64 d0, d1, d2, d3, d4; |
| + struct poly1305_arch_internal *state = ctx; |
| + u32 cy; |
| |
| - if (!nblocks) |
| + if (!state->is_base2_26) |
| return; |
| |
| - r0 = key->r[0]; |
| - r1 = key->r[1]; |
| - r2 = key->r[2]; |
| - r3 = key->r[3]; |
| - r4 = key->r[4]; |
| - |
| - s1 = r1 * 5; |
| - s2 = r2 * 5; |
| - s3 = r3 * 5; |
| - s4 = r4 * 5; |
| - |
| - h0 = state->h[0]; |
| - h1 = state->h[1]; |
| - h2 = state->h[2]; |
| - h3 = state->h[3]; |
| - h4 = state->h[4]; |
| - |
| - do { |
| - /* h += m[i] */ |
| - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; |
| - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; |
| - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; |
| - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; |
| - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); |
| - |
| - /* h *= r */ |
| - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + |
| - mlt(h3, s2) + mlt(h4, s1); |
| - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + |
| - mlt(h3, s3) + mlt(h4, s2); |
| - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + |
| - mlt(h3, s4) + mlt(h4, s3); |
| - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + |
| - mlt(h3, r0) + mlt(h4, s4); |
| - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + |
| - mlt(h3, r1) + mlt(h4, r0); |
| - |
| - /* (partial) h %= p */ |
| - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); |
| - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); |
| - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); |
| - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); |
| - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); |
| - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; |
| - |
| - src += POLY1305_BLOCK_SIZE; |
| - } while (--nblocks); |
| - |
| - state->h[0] = h0; |
| - state->h[1] = h1; |
| - state->h[2] = h2; |
| - state->h[3] = h3; |
| - state->h[4] = h4; |
| -} |
| - |
| -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) |
| -{ |
| - u32 h0, h1, h2, h3, h4; |
| - u32 g0, g1, g2, g3, g4; |
| - u32 mask; |
| - |
| - /* fully carry h */ |
| - h0 = state->h[0]; |
| - h1 = state->h[1]; |
| - h2 = state->h[2]; |
| - h3 = state->h[3]; |
| - h4 = state->h[4]; |
| - |
| - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; |
| - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; |
| - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; |
| - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; |
| - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; |
| - |
| - /* compute h + -p */ |
| - g0 = h0 + 5; |
| - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; |
| - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; |
| - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; |
| - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; |
| - |
| - /* select h if h < p, or h + -p if h >= p */ |
| - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; |
| - g0 &= mask; |
| - g1 &= mask; |
| - g2 &= mask; |
| - g3 &= mask; |
| - g4 &= mask; |
| - mask = ~mask; |
| - h0 = (h0 & mask) | g0; |
| - h1 = (h1 & mask) | g1; |
| - h2 = (h2 & mask) | g2; |
| - h3 = (h3 & mask) | g3; |
| - h4 = (h4 & mask) | g4; |
| - |
| - /* h = h % (2^128) */ |
| - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); |
| - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); |
| - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); |
| - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); |
| -} |
| - |
| -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) |
| -{ |
| - poly1305_integer_setkey(desc->opaque_r, key); |
| - desc->s[0] = get_unaligned_le32(key + 16); |
| - desc->s[1] = get_unaligned_le32(key + 20); |
| - desc->s[2] = get_unaligned_le32(key + 24); |
| - desc->s[3] = get_unaligned_le32(key + 28); |
| - poly1305_core_init(&desc->h); |
| - desc->buflen = 0; |
| - desc->sset = true; |
| - desc->rset = 1; |
| -} |
| -EXPORT_SYMBOL_GPL(poly1305_init_arch); |
| - |
| -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, |
| - const u8 *src, unsigned int srclen) |
| -{ |
| - if (!dctx->sset) { |
| - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { |
| - poly1305_integer_setkey(dctx->r, src); |
| - src += POLY1305_BLOCK_SIZE; |
| - srclen -= POLY1305_BLOCK_SIZE; |
| - dctx->rset = 1; |
| - } |
| - if (srclen >= POLY1305_BLOCK_SIZE) { |
| - dctx->s[0] = get_unaligned_le32(src + 0); |
| - dctx->s[1] = get_unaligned_le32(src + 4); |
| - dctx->s[2] = get_unaligned_le32(src + 8); |
| - dctx->s[3] = get_unaligned_le32(src + 12); |
| - src += POLY1305_BLOCK_SIZE; |
| - srclen -= POLY1305_BLOCK_SIZE; |
| - dctx->sset = true; |
| - } |
| + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; |
| + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; |
| + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; |
| + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; |
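| +	/* Repack five 26-bit limbs into two 64-bit words plus a 2-bit top word. */ |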
| + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; |
| + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); |
| + state->hs[2] = state->h[4] >> 24; |
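| +	/* ULT(a, b) is a branchless unsigned a < b, used to ripple carries. */ |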
| +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) |
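| +	/* Fold bits 130 and up back in: 5 * (top >> 2) == (top & ~3) + (top >> 2). */ |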
| + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); |
| + state->hs[2] &= 3; |
| + state->hs[0] += cy; |
| + state->hs[1] += (cy = ULT(state->hs[0], cy)); |
| + state->hs[2] += ULT(state->hs[1], cy); |
| +#undef ULT |
| + state->is_base2_26 = 0; |
| +} |
| + |
| +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) |
| +{ |
| + poly1305_init_x86_64(ctx, key); |
| +} |
| + |
| +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, |
| + const u32 padbit) |
| +{ |
| + struct poly1305_arch_internal *state = ctx; |
| + |
| +	/* kernel_fpu_begin() disables preemption, so yield after each page of input. */ |
| + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || |
| + PAGE_SIZE % POLY1305_BLOCK_SIZE); |
| + |
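| +	/* Without usable vector units, or for short inputs still in base 2^64, |
| +	 * the scalar code beats paying for kernel_fpu_begin()/end(). */ |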
| + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || |
| + !crypto_simd_usable()) { |
| + convert_to_base2_64(ctx); |
| + poly1305_blocks_x86_64(ctx, inp, len, padbit); |
| + return; |
| } |
| - return srclen; |
| -} |
| |
| -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, |
| - const u8 *src, unsigned int srclen) |
| -{ |
| - unsigned int datalen; |
| + for (;;) { |
| + const size_t bytes = min_t(size_t, len, PAGE_SIZE); |
| |
| - if (unlikely(!dctx->sset)) { |
| - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| - src += srclen - datalen; |
| - srclen = datalen; |
| - } |
| - if (srclen >= POLY1305_BLOCK_SIZE) { |
| - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, |
| - srclen / POLY1305_BLOCK_SIZE, 1); |
| - srclen %= POLY1305_BLOCK_SIZE; |
| + kernel_fpu_begin(); |
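| +		/* Dispatch to the widest enabled unit: AVX-512, then AVX2, then AVX. */ |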
| + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) |
| + poly1305_blocks_avx512(ctx, inp, bytes, padbit); |
| + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) |
| + poly1305_blocks_avx2(ctx, inp, bytes, padbit); |
| + else |
| + poly1305_blocks_avx(ctx, inp, bytes, padbit); |
| + kernel_fpu_end(); |
| + len -= bytes; |
| + if (!len) |
| + break; |
| + inp += bytes; |
| } |
| - return srclen; |
| } |
| |
| -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, |
| - const u8 *src, unsigned int srclen) |
| -{ |
| - unsigned int blocks, datalen; |
| +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| + const u32 nonce[4]) |
| +{ |
| + struct poly1305_arch_internal *state = ctx; |
| + |
| + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| + !state->is_base2_26 || !crypto_simd_usable()) { |
| + convert_to_base2_64(ctx); |
| + poly1305_emit_x86_64(ctx, mac, nonce); |
| + } else |
| + poly1305_emit_avx(ctx, mac, nonce); |
| +} |
| + |
| +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) |
| +{ |
| + poly1305_simd_init(&dctx->h, key); |
| + dctx->s[0] = get_unaligned_le32(&key[16]); |
| + dctx->s[1] = get_unaligned_le32(&key[20]); |
| + dctx->s[2] = get_unaligned_le32(&key[24]); |
| + dctx->s[3] = get_unaligned_le32(&key[28]); |
| + dctx->buflen = 0; |
| + dctx->sset = true; |
| +} |
| +EXPORT_SYMBOL(poly1305_init_arch); |
| |
| +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, |
| + const u8 *inp, unsigned int len) |
| +{ |
| + unsigned int acc = 0; |
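| +	/* The one-time key arrives in-band: the first 16-byte block seeds r |
| +	 * (via init), the next seeds s; return the number of bytes consumed. */ |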
| if (unlikely(!dctx->sset)) { |
| - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| - src += srclen - datalen; |
| - srclen = datalen; |
| - } |
| - |
| - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| - static_branch_likely(&poly1305_use_avx2) && |
| - srclen >= POLY1305_BLOCK_SIZE * 4) { |
| - if (unlikely(dctx->rset < 4)) { |
| - if (dctx->rset < 2) { |
| - dctx->r[1] = dctx->r[0]; |
| - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| - } |
| - dctx->r[2] = dctx->r[1]; |
| - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); |
| - dctx->r[3] = dctx->r[2]; |
| - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); |
| - dctx->rset = 4; |
| + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { |
| + poly1305_simd_init(&dctx->h, inp); |
| + inp += POLY1305_BLOCK_SIZE; |
| + len -= POLY1305_BLOCK_SIZE; |
| + acc += POLY1305_BLOCK_SIZE; |
| + dctx->rset = 1; |
| } |
| - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); |
| - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, |
| - dctx->r[1].r); |
| - src += POLY1305_BLOCK_SIZE * 4 * blocks; |
| - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; |
| - } |
| - |
| - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { |
| - if (unlikely(dctx->rset < 2)) { |
| - dctx->r[1] = dctx->r[0]; |
| - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| - dctx->rset = 2; |
| + if (len >= POLY1305_BLOCK_SIZE) { |
| + dctx->s[0] = get_unaligned_le32(&inp[0]); |
| + dctx->s[1] = get_unaligned_le32(&inp[4]); |
| + dctx->s[2] = get_unaligned_le32(&inp[8]); |
| + dctx->s[3] = get_unaligned_le32(&inp[12]); |
| + inp += POLY1305_BLOCK_SIZE; |
| + len -= POLY1305_BLOCK_SIZE; |
| + acc += POLY1305_BLOCK_SIZE; |
| + dctx->sset = true; |
| } |
| - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); |
| - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, |
| - blocks, dctx->r[1].r); |
| - src += POLY1305_BLOCK_SIZE * 2 * blocks; |
| - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; |
| - } |
| - if (srclen >= POLY1305_BLOCK_SIZE) { |
| - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); |
| - srclen -= POLY1305_BLOCK_SIZE; |
| } |
| - return srclen; |
| + return acc; |
| } |
| |
| void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, |
| unsigned int srclen) |
| { |
| - unsigned int bytes; |
| + unsigned int bytes, used; |
| |
| if (unlikely(dctx->buflen)) { |
| bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); |
| @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130 |
| dctx->buflen += bytes; |
| |
| if (dctx->buflen == POLY1305_BLOCK_SIZE) { |
| - if (static_branch_likely(&poly1305_use_simd) && |
| - likely(crypto_simd_usable())) { |
| - kernel_fpu_begin(); |
| - poly1305_simd_blocks(dctx, dctx->buf, |
| - POLY1305_BLOCK_SIZE); |
| - kernel_fpu_end(); |
| - } else { |
| - poly1305_scalar_blocks(dctx, dctx->buf, |
| - POLY1305_BLOCK_SIZE); |
| - } |
| + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) |
| + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); |
| dctx->buflen = 0; |
| } |
| } |
| |
| if (likely(srclen >= POLY1305_BLOCK_SIZE)) { |
| - if (static_branch_likely(&poly1305_use_simd) && |
| - likely(crypto_simd_usable())) { |
| - kernel_fpu_begin(); |
| - bytes = poly1305_simd_blocks(dctx, src, srclen); |
| - kernel_fpu_end(); |
| - } else { |
| - bytes = poly1305_scalar_blocks(dctx, src, srclen); |
| - } |
| - src += srclen - bytes; |
| - srclen = bytes; |
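| +		/* Hash all whole blocks now; any trailing partial block is |
| +		 * buffered below. */ |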
| + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); |
| + srclen -= bytes; |
| + used = crypto_poly1305_setdctxkey(dctx, src, bytes); |
| + if (likely(bytes - used)) |
| + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); |
| + src += bytes; |
| } |
| |
| if (unlikely(srclen)) { |
| @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130 |
| } |
| EXPORT_SYMBOL(poly1305_update_arch); |
| |
| -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) |
| +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) |
| { |
| - __le32 digest[4]; |
| - u64 f = 0; |
| - |
| - if (unlikely(desc->buflen)) { |
| - desc->buf[desc->buflen++] = 1; |
| - memset(desc->buf + desc->buflen, 0, |
| - POLY1305_BLOCK_SIZE - desc->buflen); |
| - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); |
| + if (unlikely(dctx->buflen)) { |
| + dctx->buf[dctx->buflen++] = 1; |
| + memset(dctx->buf + dctx->buflen, 0, |
| + POLY1305_BLOCK_SIZE - dctx->buflen); |
| + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); |
| } |
| |
| - poly1305_integer_emit(&desc->h, digest); |
| - |
| - /* mac = (h + s) % (2^128) */ |
| - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; |
| - put_unaligned_le32(f, dst + 0); |
| - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; |
| - put_unaligned_le32(f, dst + 4); |
| - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; |
| - put_unaligned_le32(f, dst + 8); |
| - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; |
| - put_unaligned_le32(f, dst + 12); |
| - |
| - *desc = (struct poly1305_desc_ctx){}; |
| + poly1305_simd_emit(&dctx->h, dst, dctx->s); |
| + *dctx = (struct poly1305_desc_ctx){}; |
| } |
| EXPORT_SYMBOL(poly1305_final_arch); |
| |
| @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s |
| { |
| struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| |
| - poly1305_core_init(&dctx->h); |
| - dctx->buflen = 0; |
| - dctx->rset = 0; |
| - dctx->sset = false; |
| - |
| + *dctx = (struct poly1305_desc_ctx){}; |
| return 0; |
| } |
| |
| -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| +static int crypto_poly1305_update(struct shash_desc *desc, |
| + const u8 *src, unsigned int srclen) |
| { |
| struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| |
| - if (unlikely(!dctx->sset)) |
| - return -ENOKEY; |
| - |
| - poly1305_final_arch(dctx, dst); |
| + poly1305_update_arch(dctx, src, srclen); |
| return 0; |
| } |
| |
| -static int poly1305_simd_update(struct shash_desc *desc, |
| - const u8 *src, unsigned int srclen) |
| +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| { |
| struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| |
| - poly1305_update_arch(dctx, src, srclen); |
| + if (unlikely(!dctx->sset)) |
| + return -ENOKEY; |
| + |
| + poly1305_final_arch(dctx, dst); |
| return 0; |
| } |
| |
| static struct shash_alg alg = { |
| .digestsize = POLY1305_DIGEST_SIZE, |
| .init = crypto_poly1305_init, |
| - .update = poly1305_simd_update, |
| + .update = crypto_poly1305_update, |
| .final = crypto_poly1305_final, |
| .descsize = sizeof(struct poly1305_desc_ctx), |
| .base = { |
| @@ -406,17 +265,19 @@ static struct shash_alg alg = { |
| |
| static int __init poly1305_simd_mod_init(void) |
| { |
| - if (!boot_cpu_has(X86_FEATURE_XMM2)) |
| - return 0; |
| - |
| - static_branch_enable(&poly1305_use_simd); |
| - |
| - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| - boot_cpu_has(X86_FEATURE_AVX) && |
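| +	/* Enable each static branch only when both the toolchain (CONFIG_AS_*) |
| +	 * and the CPU (feature bits plus XSAVE-managed state) support it. */ |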
| + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && |
| + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| + static_branch_enable(&poly1305_use_avx); |
| + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && |
| boot_cpu_has(X86_FEATURE_AVX2) && |
| cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| static_branch_enable(&poly1305_use_avx2); |
| - |
| + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && |
| + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && |
| + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && |
| +	    /* Skylake downclocks unacceptably when using zmm registers, but later generations are fast. */ |
| + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) |
| + static_branch_enable(&poly1305_use_avx512); |
| return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; |
| } |
| |
| @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); |
| module_exit(poly1305_simd_mod_exit); |
| |
| MODULE_LICENSE("GPL"); |
| -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); |
| +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); |
| MODULE_DESCRIPTION("Poly1305 authenticator"); |
| MODULE_ALIAS_CRYPTO("poly1305"); |
| MODULE_ALIAS_CRYPTO("poly1305-simd"); |
| --- a/lib/crypto/Kconfig |
| +++ b/lib/crypto/Kconfig |
| @@ -65,7 +65,7 @@ config CRYPTO_LIB_DES |
| config CRYPTO_LIB_POLY1305_RSIZE |
| int |
| default 2 if MIPS |
| - default 4 if X86_64 |
| + default 11 if X86_64 |
| default 9 if ARM || ARM64 |
| default 1 |
| |