| 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Sun, 5 Jan 2020 22:40:48 -0500 |
| 4 | Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for |
| 5 | kernel |
| 6 | |
| 7 | commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream. |
| 8 | |
| 9 | These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F. |
| 10 | The AVX-512F implementation is disabled on Skylake, due to throttling, |
| 11 | but it is quite fast on >= Cannonlake. |
| 12 | |
| 13 | On the left are cycle counts on a Core i7 6700HQ using the AVX-2 |
| 14 | codepath, comparing this implementation ("new") to the implementation in |
| 15 | the current crypto api ("old"). On the right are benchmarks on a Xeon |
| 16 | Gold 5120 using the AVX-512 codepath. The new implementation is faster |
| 17 | on all benchmarks. |
| 18 | |
| 19 | AVX-2 AVX-512 |
| 20 | --------- ----------- |
| 21 | |
| 22 | size old new size old new |
| 23 | ---- ---- ---- ---- ---- ---- |
| 24 | 0 70 68 0 74 70 |
| 25 | 16 92 90 16 96 92 |
| 26 | 32 134 104 32 136 106 |
| 27 | 48 172 120 48 184 124 |
| 28 | 64 218 136 64 218 138 |
| 29 | 80 254 158 80 260 160 |
| 30 | 96 298 174 96 300 176 |
| 31 | 112 342 192 112 342 194 |
| 32 | 128 388 212 128 384 212 |
| 33 | 144 428 228 144 420 226 |
| 34 | 160 466 246 160 464 248 |
| 35 | 176 510 264 176 504 264 |
| 36 | 192 550 282 192 544 282 |
| 37 | 208 594 302 208 582 300 |
| 38 | 224 628 316 224 624 318 |
| 39 | 240 676 334 240 662 338 |
| 40 | 256 716 354 256 708 358 |
| 41 | 272 764 374 272 748 372 |
| 42 | 288 802 352 288 788 358 |
| 43 | 304 420 366 304 422 370 |
| 44 | 320 428 360 320 432 364 |
| 45 | 336 484 378 336 486 380 |
| 46 | 352 426 384 352 434 390 |
| 47 | 368 478 400 368 480 408 |
| 48 | 384 488 394 384 490 398 |
| 49 | 400 542 408 400 542 412 |
| 50 | 416 486 416 416 492 426 |
| 51 | 432 534 430 432 538 436 |
| 52 | 448 544 422 448 546 432 |
| 53 | 464 600 438 464 600 448 |
| 54 | 480 540 448 480 548 456 |
| 55 | 496 594 464 496 594 476 |
| 56 | 512 602 456 512 606 470 |
| 57 | 528 656 476 528 656 480 |
| 58 | 544 600 480 544 606 498 |
| 59 | 560 650 494 560 652 512 |
| 60 | 576 664 490 576 662 508 |
| 61 | 592 714 508 592 716 522 |
| 62 | 608 656 514 608 664 538 |
| 63 | 624 708 532 624 710 552 |
| 64 | 640 716 524 640 720 516 |
| 65 | 656 770 536 656 772 526 |
| 66 | 672 716 548 672 722 544 |
| 67 | 688 770 562 688 768 556 |
| 68 | 704 774 552 704 778 556 |
| 69 | 720 826 568 720 832 568 |
| 70 | 736 768 574 736 780 584 |
| 71 | 752 822 592 752 826 600 |
| 72 | 768 830 584 768 836 560 |
| 73 | 784 884 602 784 888 572 |
| 74 | 800 828 610 800 838 588 |
| 75 | 816 884 628 816 884 604 |
| 76 | 832 888 618 832 894 598 |
| 77 | 848 942 632 848 946 612 |
| 78 | 864 884 644 864 896 628 |
| 79 | 880 936 660 880 942 644 |
| 80 | 896 948 652 896 952 608 |
| 81 | 912 1000 664 912 1004 616 |
| 82 | 928 942 676 928 954 634 |
| 83 | 944 994 690 944 1000 646 |
| 84 | 960 1002 680 960 1008 646 |
| 85 | 976 1054 694 976 1062 658 |
| 86 | 992 1002 706 992 1012 674 |
| 87 | 1008 1052 720 1008 1058 690 |
| 88 | |
| 89 | This commit wires in the prior implementation from Andy, and makes the |
| 90 | following changes to be suitable for kernel land. |
| 91 | |
| 92 | - Some cosmetic and structural changes, like renaming labels to |
| 93 | .Lname, constants, and other Linux conventions, as well as making |
| 94 | the code easy for us to maintain moving forward. |
| 95 | |
| 96 | - CPU feature checking is done in C by the glue code. |
| 97 | |
| 98 | - We avoid jumping into the middle of functions, to appease objtool, |
| 99 | and instead parameterize shared code. |
| 100 | |
| 101 | - We maintain frame pointers so that stack traces make sense. |
| 102 | |
| 103 | - We remove the dependency on the perl xlate code, which transforms |
| 104 | the output into formats used by assemblers we don't care about. |
| 105 | |
| 106 | Importantly, none of our changes affect the arithmetic or core code, but |
| 107 | just involve the differing environment of kernel space. |
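
As an illustration of the "CPU feature checking is done in C by the glue code" point above, the sketch below shows the general dispatch pattern: probe CPU features once at init time, flip a static branch, and let a C wrapper choose between the assembly entry points. It is a simplified illustration rather than the exact poly1305_glue.c hunk; apart from the poly1305_blocks_* entry points named elsewhere in this patch, the static key and wrapper names are assumptions for the example, and the real wrapper also chunks input around kernel_fpu_begin()/kernel_fpu_end().

    #include <crypto/internal/simd.h>
    #include <linux/jump_label.h>
    #include <linux/linkage.h>
    #include <linux/types.h>
    #include <asm/cpufeature.h>
    #include <asm/simd.h>

    /* Assembly entry points generated from poly1305-x86_64-cryptogams.pl. */
    asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
    				       const size_t len, const u32 padbit);
    asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
    				     const size_t len, const u32 padbit);

    /* Illustrative static key; the real glue code defines similar ones. */
    static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);

    static int __init poly1305_simd_mod_init(void)
    {
    	/* Feature probing happens once, in C, not in the .S code. */
    	if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
    	    boot_cpu_has(X86_FEATURE_AVX2) &&
    	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
    		static_branch_enable(&poly1305_use_avx2);
    	return 0;
    }

    static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
    				 u32 padbit)
    {
    	/* Fall back to the integer code when the FPU is not usable. */
    	if (static_branch_likely(&poly1305_use_avx2) && crypto_simd_usable()) {
    		kernel_fpu_begin();
    		poly1305_blocks_avx2(ctx, inp, len, padbit);
    		kernel_fpu_end();
    	} else {
    		poly1305_blocks_x86_64(ctx, inp, len, padbit);
    	}
    }
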
| 108 | |
| 109 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 110 | Signed-off-by: Samuel Neves <sneves@dei.uc.pt> |
| 111 | Co-developed-by: Samuel Neves <sneves@dei.uc.pt> |
| 112 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 113 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 114 | --- |
| 115 | arch/x86/crypto/.gitignore | 1 + |
| 116 | arch/x86/crypto/Makefile | 11 +- |
| 117 | arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ---------- |
| 118 | arch/x86/crypto/poly1305-sse2-x86_64.S | 590 --------------- |
| 119 | arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++-------- |
| 120 | arch/x86/crypto/poly1305_glue.c | 473 +++++------- |
| 121 | lib/crypto/Kconfig | 2 +- |
| 122 | 7 files changed, 572 insertions(+), 1577 deletions(-) |
| 123 | create mode 100644 arch/x86/crypto/.gitignore |
| 124 | delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S |
| 125 | delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S |
| 126 | |
| 127 | --- /dev/null |
| 128 | +++ b/arch/x86/crypto/.gitignore |
| 129 | @@ -0,0 +1 @@ |
| 130 | +poly1305-x86_64.S |
| 131 | --- a/arch/x86/crypto/Makefile |
| 132 | +++ b/arch/x86/crypto/Makefile |
| 133 | @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o |
| 134 | |
| 135 | nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o |
| 136 | blake2s-x86_64-y := blake2s-core.o blake2s-glue.o |
| 137 | +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o |
| 138 | +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) |
| 139 | +targets += poly1305-x86_64-cryptogams.S |
| 140 | +endif |
| 141 | |
| 142 | ifeq ($(avx_supported),yes) |
| 143 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ |
| 144 | @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni |
| 145 | aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o |
| 146 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
| 147 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
| 148 | -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o |
| 149 | ifeq ($(avx2_supported),yes) |
| 150 | sha1-ssse3-y += sha1_avx2_x86_64_asm.o |
| 151 | -poly1305-x86_64-y += poly1305-avx2-x86_64.o |
| 152 | endif |
| 153 | ifeq ($(sha1_ni_supported),yes) |
| 154 | sha1-ssse3-y += sha1_ni_asm.o |
| 155 | @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o |
| 156 | endif |
| 157 | sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o |
| 158 | crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o |
| 159 | + |
| 160 | +quiet_cmd_perlasm = PERLASM $@ |
| 161 | + cmd_perlasm = $(PERL) $< > $@ |
| 162 | +$(obj)/%.S: $(src)/%.pl FORCE |
| 163 | + $(call if_changed,perlasm) |
| 164 | --- a/arch/x86/crypto/poly1305-avx2-x86_64.S |
| 165 | +++ /dev/null |
| 166 | @@ -1,390 +0,0 @@ |
| 167 | -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| 168 | -/* |
| 169 | - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions |
| 170 | - * |
| 171 | - * Copyright (C) 2015 Martin Willi |
| 172 | - */ |
| 173 | - |
| 174 | -#include <linux/linkage.h> |
| 175 | - |
| 176 | -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 |
| 177 | -.align 32 |
| 178 | -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| 179 | - .octa 0x0000000003ffffff0000000003ffffff |
| 180 | - |
| 181 | -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 |
| 182 | -.align 32 |
| 183 | -ORMASK: .octa 0x00000000010000000000000001000000 |
| 184 | - .octa 0x00000000010000000000000001000000 |
| 185 | - |
| 186 | -.text |
| 187 | - |
| 188 | -#define h0 0x00(%rdi) |
| 189 | -#define h1 0x04(%rdi) |
| 190 | -#define h2 0x08(%rdi) |
| 191 | -#define h3 0x0c(%rdi) |
| 192 | -#define h4 0x10(%rdi) |
| 193 | -#define r0 0x00(%rdx) |
| 194 | -#define r1 0x04(%rdx) |
| 195 | -#define r2 0x08(%rdx) |
| 196 | -#define r3 0x0c(%rdx) |
| 197 | -#define r4 0x10(%rdx) |
| 198 | -#define u0 0x00(%r8) |
| 199 | -#define u1 0x04(%r8) |
| 200 | -#define u2 0x08(%r8) |
| 201 | -#define u3 0x0c(%r8) |
| 202 | -#define u4 0x10(%r8) |
| 203 | -#define w0 0x18(%r8) |
| 204 | -#define w1 0x1c(%r8) |
| 205 | -#define w2 0x20(%r8) |
| 206 | -#define w3 0x24(%r8) |
| 207 | -#define w4 0x28(%r8) |
| 208 | -#define y0 0x30(%r8) |
| 209 | -#define y1 0x34(%r8) |
| 210 | -#define y2 0x38(%r8) |
| 211 | -#define y3 0x3c(%r8) |
| 212 | -#define y4 0x40(%r8) |
| 213 | -#define m %rsi |
| 214 | -#define hc0 %ymm0 |
| 215 | -#define hc1 %ymm1 |
| 216 | -#define hc2 %ymm2 |
| 217 | -#define hc3 %ymm3 |
| 218 | -#define hc4 %ymm4 |
| 219 | -#define hc0x %xmm0 |
| 220 | -#define hc1x %xmm1 |
| 221 | -#define hc2x %xmm2 |
| 222 | -#define hc3x %xmm3 |
| 223 | -#define hc4x %xmm4 |
| 224 | -#define t1 %ymm5 |
| 225 | -#define t2 %ymm6 |
| 226 | -#define t1x %xmm5 |
| 227 | -#define t2x %xmm6 |
| 228 | -#define ruwy0 %ymm7 |
| 229 | -#define ruwy1 %ymm8 |
| 230 | -#define ruwy2 %ymm9 |
| 231 | -#define ruwy3 %ymm10 |
| 232 | -#define ruwy4 %ymm11 |
| 233 | -#define ruwy0x %xmm7 |
| 234 | -#define ruwy1x %xmm8 |
| 235 | -#define ruwy2x %xmm9 |
| 236 | -#define ruwy3x %xmm10 |
| 237 | -#define ruwy4x %xmm11 |
| 238 | -#define svxz1 %ymm12 |
| 239 | -#define svxz2 %ymm13 |
| 240 | -#define svxz3 %ymm14 |
| 241 | -#define svxz4 %ymm15 |
| 242 | -#define d0 %r9 |
| 243 | -#define d1 %r10 |
| 244 | -#define d2 %r11 |
| 245 | -#define d3 %r12 |
| 246 | -#define d4 %r13 |
| 247 | - |
| 248 | -ENTRY(poly1305_4block_avx2) |
| 249 | - # %rdi: Accumulator h[5] |
| 250 | - # %rsi: 64 byte input block m |
| 251 | - # %rdx: Poly1305 key r[5] |
| 252 | - # %rcx: Quadblock count |
| 253 | - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], |
| 254 | - |
| 255 | - # This four-block variant uses loop unrolled block processing. It |
| 256 | - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: |
| 257 | - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r |
| 258 | - |
| 259 | - vzeroupper |
| 260 | - push %rbx |
| 261 | - push %r12 |
| 262 | - push %r13 |
| 263 | - |
| 264 | - # combine r0,u0,w0,y0 |
| 265 | - vmovd y0,ruwy0x |
| 266 | - vmovd w0,t1x |
| 267 | - vpunpcklqdq t1,ruwy0,ruwy0 |
| 268 | - vmovd u0,t1x |
| 269 | - vmovd r0,t2x |
| 270 | - vpunpcklqdq t2,t1,t1 |
| 271 | - vperm2i128 $0x20,t1,ruwy0,ruwy0 |
| 272 | - |
| 273 | - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 |
| 274 | - vmovd y1,ruwy1x |
| 275 | - vmovd w1,t1x |
| 276 | - vpunpcklqdq t1,ruwy1,ruwy1 |
| 277 | - vmovd u1,t1x |
| 278 | - vmovd r1,t2x |
| 279 | - vpunpcklqdq t2,t1,t1 |
| 280 | - vperm2i128 $0x20,t1,ruwy1,ruwy1 |
| 281 | - vpslld $2,ruwy1,svxz1 |
| 282 | - vpaddd ruwy1,svxz1,svxz1 |
| 283 | - |
| 284 | - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 |
| 285 | - vmovd y2,ruwy2x |
| 286 | - vmovd w2,t1x |
| 287 | - vpunpcklqdq t1,ruwy2,ruwy2 |
| 288 | - vmovd u2,t1x |
| 289 | - vmovd r2,t2x |
| 290 | - vpunpcklqdq t2,t1,t1 |
| 291 | - vperm2i128 $0x20,t1,ruwy2,ruwy2 |
| 292 | - vpslld $2,ruwy2,svxz2 |
| 293 | - vpaddd ruwy2,svxz2,svxz2 |
| 294 | - |
| 295 | - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 |
| 296 | - vmovd y3,ruwy3x |
| 297 | - vmovd w3,t1x |
| 298 | - vpunpcklqdq t1,ruwy3,ruwy3 |
| 299 | - vmovd u3,t1x |
| 300 | - vmovd r3,t2x |
| 301 | - vpunpcklqdq t2,t1,t1 |
| 302 | - vperm2i128 $0x20,t1,ruwy3,ruwy3 |
| 303 | - vpslld $2,ruwy3,svxz3 |
| 304 | - vpaddd ruwy3,svxz3,svxz3 |
| 305 | - |
| 306 | - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 |
| 307 | - vmovd y4,ruwy4x |
| 308 | - vmovd w4,t1x |
| 309 | - vpunpcklqdq t1,ruwy4,ruwy4 |
| 310 | - vmovd u4,t1x |
| 311 | - vmovd r4,t2x |
| 312 | - vpunpcklqdq t2,t1,t1 |
| 313 | - vperm2i128 $0x20,t1,ruwy4,ruwy4 |
| 314 | - vpslld $2,ruwy4,svxz4 |
| 315 | - vpaddd ruwy4,svxz4,svxz4 |
| 316 | - |
| 317 | -.Ldoblock4: |
| 318 | - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, |
| 319 | - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] |
| 320 | - vmovd 0x00(m),hc0x |
| 321 | - vmovd 0x10(m),t1x |
| 322 | - vpunpcklqdq t1,hc0,hc0 |
| 323 | - vmovd 0x20(m),t1x |
| 324 | - vmovd 0x30(m),t2x |
| 325 | - vpunpcklqdq t2,t1,t1 |
| 326 | - vperm2i128 $0x20,t1,hc0,hc0 |
| 327 | - vpand ANMASK(%rip),hc0,hc0 |
| 328 | - vmovd h0,t1x |
| 329 | - vpaddd t1,hc0,hc0 |
| 330 | - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, |
| 331 | - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] |
| 332 | - vmovd 0x03(m),hc1x |
| 333 | - vmovd 0x13(m),t1x |
| 334 | - vpunpcklqdq t1,hc1,hc1 |
| 335 | - vmovd 0x23(m),t1x |
| 336 | - vmovd 0x33(m),t2x |
| 337 | - vpunpcklqdq t2,t1,t1 |
| 338 | - vperm2i128 $0x20,t1,hc1,hc1 |
| 339 | - vpsrld $2,hc1,hc1 |
| 340 | - vpand ANMASK(%rip),hc1,hc1 |
| 341 | - vmovd h1,t1x |
| 342 | - vpaddd t1,hc1,hc1 |
| 343 | - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, |
| 344 | - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] |
| 345 | - vmovd 0x06(m),hc2x |
| 346 | - vmovd 0x16(m),t1x |
| 347 | - vpunpcklqdq t1,hc2,hc2 |
| 348 | - vmovd 0x26(m),t1x |
| 349 | - vmovd 0x36(m),t2x |
| 350 | - vpunpcklqdq t2,t1,t1 |
| 351 | - vperm2i128 $0x20,t1,hc2,hc2 |
| 352 | - vpsrld $4,hc2,hc2 |
| 353 | - vpand ANMASK(%rip),hc2,hc2 |
| 354 | - vmovd h2,t1x |
| 355 | - vpaddd t1,hc2,hc2 |
| 356 | - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, |
| 357 | - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] |
| 358 | - vmovd 0x09(m),hc3x |
| 359 | - vmovd 0x19(m),t1x |
| 360 | - vpunpcklqdq t1,hc3,hc3 |
| 361 | - vmovd 0x29(m),t1x |
| 362 | - vmovd 0x39(m),t2x |
| 363 | - vpunpcklqdq t2,t1,t1 |
| 364 | - vperm2i128 $0x20,t1,hc3,hc3 |
| 365 | - vpsrld $6,hc3,hc3 |
| 366 | - vpand ANMASK(%rip),hc3,hc3 |
| 367 | - vmovd h3,t1x |
| 368 | - vpaddd t1,hc3,hc3 |
| 369 | - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), |
| 370 | - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] |
| 371 | - vmovd 0x0c(m),hc4x |
| 372 | - vmovd 0x1c(m),t1x |
| 373 | - vpunpcklqdq t1,hc4,hc4 |
| 374 | - vmovd 0x2c(m),t1x |
| 375 | - vmovd 0x3c(m),t2x |
| 376 | - vpunpcklqdq t2,t1,t1 |
| 377 | - vperm2i128 $0x20,t1,hc4,hc4 |
| 378 | - vpsrld $8,hc4,hc4 |
| 379 | - vpor ORMASK(%rip),hc4,hc4 |
| 380 | - vmovd h4,t1x |
| 381 | - vpaddd t1,hc4,hc4 |
| 382 | - |
| 383 | - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] |
| 384 | - vpmuludq hc0,ruwy0,t1 |
| 385 | - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] |
| 386 | - vpmuludq hc1,svxz4,t2 |
| 387 | - vpaddq t2,t1,t1 |
| 388 | - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] |
| 389 | - vpmuludq hc2,svxz3,t2 |
| 390 | - vpaddq t2,t1,t1 |
| 391 | - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] |
| 392 | - vpmuludq hc3,svxz2,t2 |
| 393 | - vpaddq t2,t1,t1 |
| 394 | - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] |
| 395 | - vpmuludq hc4,svxz1,t2 |
| 396 | - vpaddq t2,t1,t1 |
| 397 | - # d0 = t1[0] + t1[1] + t[2] + t[3] |
| 398 | - vpermq $0xee,t1,t2 |
| 399 | - vpaddq t2,t1,t1 |
| 400 | - vpsrldq $8,t1,t2 |
| 401 | - vpaddq t2,t1,t1 |
| 402 | - vmovq t1x,d0 |
| 403 | - |
| 404 | - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] |
| 405 | - vpmuludq hc0,ruwy1,t1 |
| 406 | - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] |
| 407 | - vpmuludq hc1,ruwy0,t2 |
| 408 | - vpaddq t2,t1,t1 |
| 409 | - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] |
| 410 | - vpmuludq hc2,svxz4,t2 |
| 411 | - vpaddq t2,t1,t1 |
| 412 | - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] |
| 413 | - vpmuludq hc3,svxz3,t2 |
| 414 | - vpaddq t2,t1,t1 |
| 415 | - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] |
| 416 | - vpmuludq hc4,svxz2,t2 |
| 417 | - vpaddq t2,t1,t1 |
| 418 | - # d1 = t1[0] + t1[1] + t1[3] + t1[4] |
| 419 | - vpermq $0xee,t1,t2 |
| 420 | - vpaddq t2,t1,t1 |
| 421 | - vpsrldq $8,t1,t2 |
| 422 | - vpaddq t2,t1,t1 |
| 423 | - vmovq t1x,d1 |
| 424 | - |
| 425 | - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] |
| 426 | - vpmuludq hc0,ruwy2,t1 |
| 427 | - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] |
| 428 | - vpmuludq hc1,ruwy1,t2 |
| 429 | - vpaddq t2,t1,t1 |
| 430 | - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] |
| 431 | - vpmuludq hc2,ruwy0,t2 |
| 432 | - vpaddq t2,t1,t1 |
| 433 | - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] |
| 434 | - vpmuludq hc3,svxz4,t2 |
| 435 | - vpaddq t2,t1,t1 |
| 436 | - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] |
| 437 | - vpmuludq hc4,svxz3,t2 |
| 438 | - vpaddq t2,t1,t1 |
| 439 | - # d2 = t1[0] + t1[1] + t1[2] + t1[3] |
| 440 | - vpermq $0xee,t1,t2 |
| 441 | - vpaddq t2,t1,t1 |
| 442 | - vpsrldq $8,t1,t2 |
| 443 | - vpaddq t2,t1,t1 |
| 444 | - vmovq t1x,d2 |
| 445 | - |
| 446 | - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] |
| 447 | - vpmuludq hc0,ruwy3,t1 |
| 448 | - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] |
| 449 | - vpmuludq hc1,ruwy2,t2 |
| 450 | - vpaddq t2,t1,t1 |
| 451 | - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] |
| 452 | - vpmuludq hc2,ruwy1,t2 |
| 453 | - vpaddq t2,t1,t1 |
| 454 | - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] |
| 455 | - vpmuludq hc3,ruwy0,t2 |
| 456 | - vpaddq t2,t1,t1 |
| 457 | - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] |
| 458 | - vpmuludq hc4,svxz4,t2 |
| 459 | - vpaddq t2,t1,t1 |
| 460 | - # d3 = t1[0] + t1[1] + t1[2] + t1[3] |
| 461 | - vpermq $0xee,t1,t2 |
| 462 | - vpaddq t2,t1,t1 |
| 463 | - vpsrldq $8,t1,t2 |
| 464 | - vpaddq t2,t1,t1 |
| 465 | - vmovq t1x,d3 |
| 466 | - |
| 467 | - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] |
| 468 | - vpmuludq hc0,ruwy4,t1 |
| 469 | - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] |
| 470 | - vpmuludq hc1,ruwy3,t2 |
| 471 | - vpaddq t2,t1,t1 |
| 472 | - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] |
| 473 | - vpmuludq hc2,ruwy2,t2 |
| 474 | - vpaddq t2,t1,t1 |
| 475 | - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] |
| 476 | - vpmuludq hc3,ruwy1,t2 |
| 477 | - vpaddq t2,t1,t1 |
| 478 | - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] |
| 479 | - vpmuludq hc4,ruwy0,t2 |
| 480 | - vpaddq t2,t1,t1 |
| 481 | - # d4 = t1[0] + t1[1] + t1[2] + t1[3] |
| 482 | - vpermq $0xee,t1,t2 |
| 483 | - vpaddq t2,t1,t1 |
| 484 | - vpsrldq $8,t1,t2 |
| 485 | - vpaddq t2,t1,t1 |
| 486 | - vmovq t1x,d4 |
| 487 | - |
| 488 | - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| 489 | - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| 490 | - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| 491 | - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| 492 | - # integers. It's true in a single-block implementation, but not here. |
| 493 | - |
| 494 | - # d1 += d0 >> 26 |
| 495 | - mov d0,%rax |
| 496 | - shr $26,%rax |
| 497 | - add %rax,d1 |
| 498 | - # h0 = d0 & 0x3ffffff |
| 499 | - mov d0,%rbx |
| 500 | - and $0x3ffffff,%ebx |
| 501 | - |
| 502 | - # d2 += d1 >> 26 |
| 503 | - mov d1,%rax |
| 504 | - shr $26,%rax |
| 505 | - add %rax,d2 |
| 506 | - # h1 = d1 & 0x3ffffff |
| 507 | - mov d1,%rax |
| 508 | - and $0x3ffffff,%eax |
| 509 | - mov %eax,h1 |
| 510 | - |
| 511 | - # d3 += d2 >> 26 |
| 512 | - mov d2,%rax |
| 513 | - shr $26,%rax |
| 514 | - add %rax,d3 |
| 515 | - # h2 = d2 & 0x3ffffff |
| 516 | - mov d2,%rax |
| 517 | - and $0x3ffffff,%eax |
| 518 | - mov %eax,h2 |
| 519 | - |
| 520 | - # d4 += d3 >> 26 |
| 521 | - mov d3,%rax |
| 522 | - shr $26,%rax |
| 523 | - add %rax,d4 |
| 524 | - # h3 = d3 & 0x3ffffff |
| 525 | - mov d3,%rax |
| 526 | - and $0x3ffffff,%eax |
| 527 | - mov %eax,h3 |
| 528 | - |
| 529 | - # h0 += (d4 >> 26) * 5 |
| 530 | - mov d4,%rax |
| 531 | - shr $26,%rax |
| 532 | - lea (%rax,%rax,4),%rax |
| 533 | - add %rax,%rbx |
| 534 | - # h4 = d4 & 0x3ffffff |
| 535 | - mov d4,%rax |
| 536 | - and $0x3ffffff,%eax |
| 537 | - mov %eax,h4 |
| 538 | - |
| 539 | - # h1 += h0 >> 26 |
| 540 | - mov %rbx,%rax |
| 541 | - shr $26,%rax |
| 542 | - add %eax,h1 |
| 543 | - # h0 = h0 & 0x3ffffff |
| 544 | - andl $0x3ffffff,%ebx |
| 545 | - mov %ebx,h0 |
| 546 | - |
| 547 | - add $0x40,m |
| 548 | - dec %rcx |
| 549 | - jnz .Ldoblock4 |
| 550 | - |
| 551 | - vzeroupper |
| 552 | - pop %r13 |
| 553 | - pop %r12 |
| 554 | - pop %rbx |
| 555 | - ret |
| 556 | -ENDPROC(poly1305_4block_avx2) |
| 557 | --- a/arch/x86/crypto/poly1305-sse2-x86_64.S |
| 558 | +++ /dev/null |
| 559 | @@ -1,590 +0,0 @@ |
| 560 | -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| 561 | -/* |
| 562 | - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions |
| 563 | - * |
| 564 | - * Copyright (C) 2015 Martin Willi |
| 565 | - */ |
| 566 | - |
| 567 | -#include <linux/linkage.h> |
| 568 | - |
| 569 | -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 |
| 570 | -.align 16 |
| 571 | -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| 572 | - |
| 573 | -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 |
| 574 | -.align 16 |
| 575 | -ORMASK: .octa 0x00000000010000000000000001000000 |
| 576 | - |
| 577 | -.text |
| 578 | - |
| 579 | -#define h0 0x00(%rdi) |
| 580 | -#define h1 0x04(%rdi) |
| 581 | -#define h2 0x08(%rdi) |
| 582 | -#define h3 0x0c(%rdi) |
| 583 | -#define h4 0x10(%rdi) |
| 584 | -#define r0 0x00(%rdx) |
| 585 | -#define r1 0x04(%rdx) |
| 586 | -#define r2 0x08(%rdx) |
| 587 | -#define r3 0x0c(%rdx) |
| 588 | -#define r4 0x10(%rdx) |
| 589 | -#define s1 0x00(%rsp) |
| 590 | -#define s2 0x04(%rsp) |
| 591 | -#define s3 0x08(%rsp) |
| 592 | -#define s4 0x0c(%rsp) |
| 593 | -#define m %rsi |
| 594 | -#define h01 %xmm0 |
| 595 | -#define h23 %xmm1 |
| 596 | -#define h44 %xmm2 |
| 597 | -#define t1 %xmm3 |
| 598 | -#define t2 %xmm4 |
| 599 | -#define t3 %xmm5 |
| 600 | -#define t4 %xmm6 |
| 601 | -#define mask %xmm7 |
| 602 | -#define d0 %r8 |
| 603 | -#define d1 %r9 |
| 604 | -#define d2 %r10 |
| 605 | -#define d3 %r11 |
| 606 | -#define d4 %r12 |
| 607 | - |
| 608 | -ENTRY(poly1305_block_sse2) |
| 609 | - # %rdi: Accumulator h[5] |
| 610 | - # %rsi: 16 byte input block m |
| 611 | - # %rdx: Poly1305 key r[5] |
| 612 | - # %rcx: Block count |
| 613 | - |
| 614 | - # This single block variant tries to improve performance by doing two |
| 615 | - # multiplications in parallel using SSE instructions. There is quite |
| 616 | - # some quardword packing involved, hence the speedup is marginal. |
| 617 | - |
| 618 | - push %rbx |
| 619 | - push %r12 |
| 620 | - sub $0x10,%rsp |
| 621 | - |
| 622 | - # s1..s4 = r1..r4 * 5 |
| 623 | - mov r1,%eax |
| 624 | - lea (%eax,%eax,4),%eax |
| 625 | - mov %eax,s1 |
| 626 | - mov r2,%eax |
| 627 | - lea (%eax,%eax,4),%eax |
| 628 | - mov %eax,s2 |
| 629 | - mov r3,%eax |
| 630 | - lea (%eax,%eax,4),%eax |
| 631 | - mov %eax,s3 |
| 632 | - mov r4,%eax |
| 633 | - lea (%eax,%eax,4),%eax |
| 634 | - mov %eax,s4 |
| 635 | - |
| 636 | - movdqa ANMASK(%rip),mask |
| 637 | - |
| 638 | -.Ldoblock: |
| 639 | - # h01 = [0, h1, 0, h0] |
| 640 | - # h23 = [0, h3, 0, h2] |
| 641 | - # h44 = [0, h4, 0, h4] |
| 642 | - movd h0,h01 |
| 643 | - movd h1,t1 |
| 644 | - movd h2,h23 |
| 645 | - movd h3,t2 |
| 646 | - movd h4,h44 |
| 647 | - punpcklqdq t1,h01 |
| 648 | - punpcklqdq t2,h23 |
| 649 | - punpcklqdq h44,h44 |
| 650 | - |
| 651 | - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] |
| 652 | - movd 0x00(m),t1 |
| 653 | - movd 0x03(m),t2 |
| 654 | - psrld $2,t2 |
| 655 | - punpcklqdq t2,t1 |
| 656 | - pand mask,t1 |
| 657 | - paddd t1,h01 |
| 658 | - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] |
| 659 | - movd 0x06(m),t1 |
| 660 | - movd 0x09(m),t2 |
| 661 | - psrld $4,t1 |
| 662 | - psrld $6,t2 |
| 663 | - punpcklqdq t2,t1 |
| 664 | - pand mask,t1 |
| 665 | - paddd t1,h23 |
| 666 | - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] |
| 667 | - mov 0x0c(m),%eax |
| 668 | - shr $8,%eax |
| 669 | - or $0x01000000,%eax |
| 670 | - movd %eax,t1 |
| 671 | - pshufd $0xc4,t1,t1 |
| 672 | - paddd t1,h44 |
| 673 | - |
| 674 | - # t1[0] = h0 * r0 + h2 * s3 |
| 675 | - # t1[1] = h1 * s4 + h3 * s2 |
| 676 | - movd r0,t1 |
| 677 | - movd s4,t2 |
| 678 | - punpcklqdq t2,t1 |
| 679 | - pmuludq h01,t1 |
| 680 | - movd s3,t2 |
| 681 | - movd s2,t3 |
| 682 | - punpcklqdq t3,t2 |
| 683 | - pmuludq h23,t2 |
| 684 | - paddq t2,t1 |
| 685 | - # t2[0] = h0 * r1 + h2 * s4 |
| 686 | - # t2[1] = h1 * r0 + h3 * s3 |
| 687 | - movd r1,t2 |
| 688 | - movd r0,t3 |
| 689 | - punpcklqdq t3,t2 |
| 690 | - pmuludq h01,t2 |
| 691 | - movd s4,t3 |
| 692 | - movd s3,t4 |
| 693 | - punpcklqdq t4,t3 |
| 694 | - pmuludq h23,t3 |
| 695 | - paddq t3,t2 |
| 696 | - # t3[0] = h4 * s1 |
| 697 | - # t3[1] = h4 * s2 |
| 698 | - movd s1,t3 |
| 699 | - movd s2,t4 |
| 700 | - punpcklqdq t4,t3 |
| 701 | - pmuludq h44,t3 |
| 702 | - # d0 = t1[0] + t1[1] + t3[0] |
| 703 | - # d1 = t2[0] + t2[1] + t3[1] |
| 704 | - movdqa t1,t4 |
| 705 | - punpcklqdq t2,t4 |
| 706 | - punpckhqdq t2,t1 |
| 707 | - paddq t4,t1 |
| 708 | - paddq t3,t1 |
| 709 | - movq t1,d0 |
| 710 | - psrldq $8,t1 |
| 711 | - movq t1,d1 |
| 712 | - |
| 713 | - # t1[0] = h0 * r2 + h2 * r0 |
| 714 | - # t1[1] = h1 * r1 + h3 * s4 |
| 715 | - movd r2,t1 |
| 716 | - movd r1,t2 |
| 717 | - punpcklqdq t2,t1 |
| 718 | - pmuludq h01,t1 |
| 719 | - movd r0,t2 |
| 720 | - movd s4,t3 |
| 721 | - punpcklqdq t3,t2 |
| 722 | - pmuludq h23,t2 |
| 723 | - paddq t2,t1 |
| 724 | - # t2[0] = h0 * r3 + h2 * r1 |
| 725 | - # t2[1] = h1 * r2 + h3 * r0 |
| 726 | - movd r3,t2 |
| 727 | - movd r2,t3 |
| 728 | - punpcklqdq t3,t2 |
| 729 | - pmuludq h01,t2 |
| 730 | - movd r1,t3 |
| 731 | - movd r0,t4 |
| 732 | - punpcklqdq t4,t3 |
| 733 | - pmuludq h23,t3 |
| 734 | - paddq t3,t2 |
| 735 | - # t3[0] = h4 * s3 |
| 736 | - # t3[1] = h4 * s4 |
| 737 | - movd s3,t3 |
| 738 | - movd s4,t4 |
| 739 | - punpcklqdq t4,t3 |
| 740 | - pmuludq h44,t3 |
| 741 | - # d2 = t1[0] + t1[1] + t3[0] |
| 742 | - # d3 = t2[0] + t2[1] + t3[1] |
| 743 | - movdqa t1,t4 |
| 744 | - punpcklqdq t2,t4 |
| 745 | - punpckhqdq t2,t1 |
| 746 | - paddq t4,t1 |
| 747 | - paddq t3,t1 |
| 748 | - movq t1,d2 |
| 749 | - psrldq $8,t1 |
| 750 | - movq t1,d3 |
| 751 | - |
| 752 | - # t1[0] = h0 * r4 + h2 * r2 |
| 753 | - # t1[1] = h1 * r3 + h3 * r1 |
| 754 | - movd r4,t1 |
| 755 | - movd r3,t2 |
| 756 | - punpcklqdq t2,t1 |
| 757 | - pmuludq h01,t1 |
| 758 | - movd r2,t2 |
| 759 | - movd r1,t3 |
| 760 | - punpcklqdq t3,t2 |
| 761 | - pmuludq h23,t2 |
| 762 | - paddq t2,t1 |
| 763 | - # t3[0] = h4 * r0 |
| 764 | - movd r0,t3 |
| 765 | - pmuludq h44,t3 |
| 766 | - # d4 = t1[0] + t1[1] + t3[0] |
| 767 | - movdqa t1,t4 |
| 768 | - psrldq $8,t4 |
| 769 | - paddq t4,t1 |
| 770 | - paddq t3,t1 |
| 771 | - movq t1,d4 |
| 772 | - |
| 773 | - # d1 += d0 >> 26 |
| 774 | - mov d0,%rax |
| 775 | - shr $26,%rax |
| 776 | - add %rax,d1 |
| 777 | - # h0 = d0 & 0x3ffffff |
| 778 | - mov d0,%rbx |
| 779 | - and $0x3ffffff,%ebx |
| 780 | - |
| 781 | - # d2 += d1 >> 26 |
| 782 | - mov d1,%rax |
| 783 | - shr $26,%rax |
| 784 | - add %rax,d2 |
| 785 | - # h1 = d1 & 0x3ffffff |
| 786 | - mov d1,%rax |
| 787 | - and $0x3ffffff,%eax |
| 788 | - mov %eax,h1 |
| 789 | - |
| 790 | - # d3 += d2 >> 26 |
| 791 | - mov d2,%rax |
| 792 | - shr $26,%rax |
| 793 | - add %rax,d3 |
| 794 | - # h2 = d2 & 0x3ffffff |
| 795 | - mov d2,%rax |
| 796 | - and $0x3ffffff,%eax |
| 797 | - mov %eax,h2 |
| 798 | - |
| 799 | - # d4 += d3 >> 26 |
| 800 | - mov d3,%rax |
| 801 | - shr $26,%rax |
| 802 | - add %rax,d4 |
| 803 | - # h3 = d3 & 0x3ffffff |
| 804 | - mov d3,%rax |
| 805 | - and $0x3ffffff,%eax |
| 806 | - mov %eax,h3 |
| 807 | - |
| 808 | - # h0 += (d4 >> 26) * 5 |
| 809 | - mov d4,%rax |
| 810 | - shr $26,%rax |
| 811 | - lea (%rax,%rax,4),%rax |
| 812 | - add %rax,%rbx |
| 813 | - # h4 = d4 & 0x3ffffff |
| 814 | - mov d4,%rax |
| 815 | - and $0x3ffffff,%eax |
| 816 | - mov %eax,h4 |
| 817 | - |
| 818 | - # h1 += h0 >> 26 |
| 819 | - mov %rbx,%rax |
| 820 | - shr $26,%rax |
| 821 | - add %eax,h1 |
| 822 | - # h0 = h0 & 0x3ffffff |
| 823 | - andl $0x3ffffff,%ebx |
| 824 | - mov %ebx,h0 |
| 825 | - |
| 826 | - add $0x10,m |
| 827 | - dec %rcx |
| 828 | - jnz .Ldoblock |
| 829 | - |
| 830 | - # Zeroing of key material |
| 831 | - mov %rcx,0x00(%rsp) |
| 832 | - mov %rcx,0x08(%rsp) |
| 833 | - |
| 834 | - add $0x10,%rsp |
| 835 | - pop %r12 |
| 836 | - pop %rbx |
| 837 | - ret |
| 838 | -ENDPROC(poly1305_block_sse2) |
| 839 | - |
| 840 | - |
| 841 | -#define u0 0x00(%r8) |
| 842 | -#define u1 0x04(%r8) |
| 843 | -#define u2 0x08(%r8) |
| 844 | -#define u3 0x0c(%r8) |
| 845 | -#define u4 0x10(%r8) |
| 846 | -#define hc0 %xmm0 |
| 847 | -#define hc1 %xmm1 |
| 848 | -#define hc2 %xmm2 |
| 849 | -#define hc3 %xmm5 |
| 850 | -#define hc4 %xmm6 |
| 851 | -#define ru0 %xmm7 |
| 852 | -#define ru1 %xmm8 |
| 853 | -#define ru2 %xmm9 |
| 854 | -#define ru3 %xmm10 |
| 855 | -#define ru4 %xmm11 |
| 856 | -#define sv1 %xmm12 |
| 857 | -#define sv2 %xmm13 |
| 858 | -#define sv3 %xmm14 |
| 859 | -#define sv4 %xmm15 |
| 860 | -#undef d0 |
| 861 | -#define d0 %r13 |
| 862 | - |
| 863 | -ENTRY(poly1305_2block_sse2) |
| 864 | - # %rdi: Accumulator h[5] |
| 865 | - # %rsi: 16 byte input block m |
| 866 | - # %rdx: Poly1305 key r[5] |
| 867 | - # %rcx: Doubleblock count |
| 868 | - # %r8: Poly1305 derived key r^2 u[5] |
| 869 | - |
| 870 | - # This two-block variant further improves performance by using loop |
| 871 | - # unrolled block processing. This is more straight forward and does |
| 872 | - # less byte shuffling, but requires a second Poly1305 key r^2: |
| 873 | - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r |
| 874 | - |
| 875 | - push %rbx |
| 876 | - push %r12 |
| 877 | - push %r13 |
| 878 | - |
| 879 | - # combine r0,u0 |
| 880 | - movd u0,ru0 |
| 881 | - movd r0,t1 |
| 882 | - punpcklqdq t1,ru0 |
| 883 | - |
| 884 | - # combine r1,u1 and s1=r1*5,v1=u1*5 |
| 885 | - movd u1,ru1 |
| 886 | - movd r1,t1 |
| 887 | - punpcklqdq t1,ru1 |
| 888 | - movdqa ru1,sv1 |
| 889 | - pslld $2,sv1 |
| 890 | - paddd ru1,sv1 |
| 891 | - |
| 892 | - # combine r2,u2 and s2=r2*5,v2=u2*5 |
| 893 | - movd u2,ru2 |
| 894 | - movd r2,t1 |
| 895 | - punpcklqdq t1,ru2 |
| 896 | - movdqa ru2,sv2 |
| 897 | - pslld $2,sv2 |
| 898 | - paddd ru2,sv2 |
| 899 | - |
| 900 | - # combine r3,u3 and s3=r3*5,v3=u3*5 |
| 901 | - movd u3,ru3 |
| 902 | - movd r3,t1 |
| 903 | - punpcklqdq t1,ru3 |
| 904 | - movdqa ru3,sv3 |
| 905 | - pslld $2,sv3 |
| 906 | - paddd ru3,sv3 |
| 907 | - |
| 908 | - # combine r4,u4 and s4=r4*5,v4=u4*5 |
| 909 | - movd u4,ru4 |
| 910 | - movd r4,t1 |
| 911 | - punpcklqdq t1,ru4 |
| 912 | - movdqa ru4,sv4 |
| 913 | - pslld $2,sv4 |
| 914 | - paddd ru4,sv4 |
| 915 | - |
| 916 | -.Ldoblock2: |
| 917 | - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] |
| 918 | - movd 0x00(m),hc0 |
| 919 | - movd 0x10(m),t1 |
| 920 | - punpcklqdq t1,hc0 |
| 921 | - pand ANMASK(%rip),hc0 |
| 922 | - movd h0,t1 |
| 923 | - paddd t1,hc0 |
| 924 | - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] |
| 925 | - movd 0x03(m),hc1 |
| 926 | - movd 0x13(m),t1 |
| 927 | - punpcklqdq t1,hc1 |
| 928 | - psrld $2,hc1 |
| 929 | - pand ANMASK(%rip),hc1 |
| 930 | - movd h1,t1 |
| 931 | - paddd t1,hc1 |
| 932 | - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] |
| 933 | - movd 0x06(m),hc2 |
| 934 | - movd 0x16(m),t1 |
| 935 | - punpcklqdq t1,hc2 |
| 936 | - psrld $4,hc2 |
| 937 | - pand ANMASK(%rip),hc2 |
| 938 | - movd h2,t1 |
| 939 | - paddd t1,hc2 |
| 940 | - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] |
| 941 | - movd 0x09(m),hc3 |
| 942 | - movd 0x19(m),t1 |
| 943 | - punpcklqdq t1,hc3 |
| 944 | - psrld $6,hc3 |
| 945 | - pand ANMASK(%rip),hc3 |
| 946 | - movd h3,t1 |
| 947 | - paddd t1,hc3 |
| 948 | - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] |
| 949 | - movd 0x0c(m),hc4 |
| 950 | - movd 0x1c(m),t1 |
| 951 | - punpcklqdq t1,hc4 |
| 952 | - psrld $8,hc4 |
| 953 | - por ORMASK(%rip),hc4 |
| 954 | - movd h4,t1 |
| 955 | - paddd t1,hc4 |
| 956 | - |
| 957 | - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] |
| 958 | - movdqa ru0,t1 |
| 959 | - pmuludq hc0,t1 |
| 960 | - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] |
| 961 | - movdqa sv4,t2 |
| 962 | - pmuludq hc1,t2 |
| 963 | - paddq t2,t1 |
| 964 | - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] |
| 965 | - movdqa sv3,t2 |
| 966 | - pmuludq hc2,t2 |
| 967 | - paddq t2,t1 |
| 968 | - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] |
| 969 | - movdqa sv2,t2 |
| 970 | - pmuludq hc3,t2 |
| 971 | - paddq t2,t1 |
| 972 | - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] |
| 973 | - movdqa sv1,t2 |
| 974 | - pmuludq hc4,t2 |
| 975 | - paddq t2,t1 |
| 976 | - # d0 = t1[0] + t1[1] |
| 977 | - movdqa t1,t2 |
| 978 | - psrldq $8,t2 |
| 979 | - paddq t2,t1 |
| 980 | - movq t1,d0 |
| 981 | - |
| 982 | - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] |
| 983 | - movdqa ru1,t1 |
| 984 | - pmuludq hc0,t1 |
| 985 | - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] |
| 986 | - movdqa ru0,t2 |
| 987 | - pmuludq hc1,t2 |
| 988 | - paddq t2,t1 |
| 989 | - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] |
| 990 | - movdqa sv4,t2 |
| 991 | - pmuludq hc2,t2 |
| 992 | - paddq t2,t1 |
| 993 | - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] |
| 994 | - movdqa sv3,t2 |
| 995 | - pmuludq hc3,t2 |
| 996 | - paddq t2,t1 |
| 997 | - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] |
| 998 | - movdqa sv2,t2 |
| 999 | - pmuludq hc4,t2 |
| 1000 | - paddq t2,t1 |
| 1001 | - # d1 = t1[0] + t1[1] |
| 1002 | - movdqa t1,t2 |
| 1003 | - psrldq $8,t2 |
| 1004 | - paddq t2,t1 |
| 1005 | - movq t1,d1 |
| 1006 | - |
| 1007 | - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] |
| 1008 | - movdqa ru2,t1 |
| 1009 | - pmuludq hc0,t1 |
| 1010 | - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] |
| 1011 | - movdqa ru1,t2 |
| 1012 | - pmuludq hc1,t2 |
| 1013 | - paddq t2,t1 |
| 1014 | - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] |
| 1015 | - movdqa ru0,t2 |
| 1016 | - pmuludq hc2,t2 |
| 1017 | - paddq t2,t1 |
| 1018 | - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] |
| 1019 | - movdqa sv4,t2 |
| 1020 | - pmuludq hc3,t2 |
| 1021 | - paddq t2,t1 |
| 1022 | - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] |
| 1023 | - movdqa sv3,t2 |
| 1024 | - pmuludq hc4,t2 |
| 1025 | - paddq t2,t1 |
| 1026 | - # d2 = t1[0] + t1[1] |
| 1027 | - movdqa t1,t2 |
| 1028 | - psrldq $8,t2 |
| 1029 | - paddq t2,t1 |
| 1030 | - movq t1,d2 |
| 1031 | - |
| 1032 | - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] |
| 1033 | - movdqa ru3,t1 |
| 1034 | - pmuludq hc0,t1 |
| 1035 | - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] |
| 1036 | - movdqa ru2,t2 |
| 1037 | - pmuludq hc1,t2 |
| 1038 | - paddq t2,t1 |
| 1039 | - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] |
| 1040 | - movdqa ru1,t2 |
| 1041 | - pmuludq hc2,t2 |
| 1042 | - paddq t2,t1 |
| 1043 | - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] |
| 1044 | - movdqa ru0,t2 |
| 1045 | - pmuludq hc3,t2 |
| 1046 | - paddq t2,t1 |
| 1047 | - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] |
| 1048 | - movdqa sv4,t2 |
| 1049 | - pmuludq hc4,t2 |
| 1050 | - paddq t2,t1 |
| 1051 | - # d3 = t1[0] + t1[1] |
| 1052 | - movdqa t1,t2 |
| 1053 | - psrldq $8,t2 |
| 1054 | - paddq t2,t1 |
| 1055 | - movq t1,d3 |
| 1056 | - |
| 1057 | - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] |
| 1058 | - movdqa ru4,t1 |
| 1059 | - pmuludq hc0,t1 |
| 1060 | - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] |
| 1061 | - movdqa ru3,t2 |
| 1062 | - pmuludq hc1,t2 |
| 1063 | - paddq t2,t1 |
| 1064 | - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] |
| 1065 | - movdqa ru2,t2 |
| 1066 | - pmuludq hc2,t2 |
| 1067 | - paddq t2,t1 |
| 1068 | - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] |
| 1069 | - movdqa ru1,t2 |
| 1070 | - pmuludq hc3,t2 |
| 1071 | - paddq t2,t1 |
| 1072 | - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] |
| 1073 | - movdqa ru0,t2 |
| 1074 | - pmuludq hc4,t2 |
| 1075 | - paddq t2,t1 |
| 1076 | - # d4 = t1[0] + t1[1] |
| 1077 | - movdqa t1,t2 |
| 1078 | - psrldq $8,t2 |
| 1079 | - paddq t2,t1 |
| 1080 | - movq t1,d4 |
| 1081 | - |
| 1082 | - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| 1083 | - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| 1084 | - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| 1085 | - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| 1086 | - # integers. It's true in a single-block implementation, but not here. |
| 1087 | - |
| 1088 | - # d1 += d0 >> 26 |
| 1089 | - mov d0,%rax |
| 1090 | - shr $26,%rax |
| 1091 | - add %rax,d1 |
| 1092 | - # h0 = d0 & 0x3ffffff |
| 1093 | - mov d0,%rbx |
| 1094 | - and $0x3ffffff,%ebx |
| 1095 | - |
| 1096 | - # d2 += d1 >> 26 |
| 1097 | - mov d1,%rax |
| 1098 | - shr $26,%rax |
| 1099 | - add %rax,d2 |
| 1100 | - # h1 = d1 & 0x3ffffff |
| 1101 | - mov d1,%rax |
| 1102 | - and $0x3ffffff,%eax |
| 1103 | - mov %eax,h1 |
| 1104 | - |
| 1105 | - # d3 += d2 >> 26 |
| 1106 | - mov d2,%rax |
| 1107 | - shr $26,%rax |
| 1108 | - add %rax,d3 |
| 1109 | - # h2 = d2 & 0x3ffffff |
| 1110 | - mov d2,%rax |
| 1111 | - and $0x3ffffff,%eax |
| 1112 | - mov %eax,h2 |
| 1113 | - |
| 1114 | - # d4 += d3 >> 26 |
| 1115 | - mov d3,%rax |
| 1116 | - shr $26,%rax |
| 1117 | - add %rax,d4 |
| 1118 | - # h3 = d3 & 0x3ffffff |
| 1119 | - mov d3,%rax |
| 1120 | - and $0x3ffffff,%eax |
| 1121 | - mov %eax,h3 |
| 1122 | - |
| 1123 | - # h0 += (d4 >> 26) * 5 |
| 1124 | - mov d4,%rax |
| 1125 | - shr $26,%rax |
| 1126 | - lea (%rax,%rax,4),%rax |
| 1127 | - add %rax,%rbx |
| 1128 | - # h4 = d4 & 0x3ffffff |
| 1129 | - mov d4,%rax |
| 1130 | - and $0x3ffffff,%eax |
| 1131 | - mov %eax,h4 |
| 1132 | - |
| 1133 | - # h1 += h0 >> 26 |
| 1134 | - mov %rbx,%rax |
| 1135 | - shr $26,%rax |
| 1136 | - add %eax,h1 |
| 1137 | - # h0 = h0 & 0x3ffffff |
| 1138 | - andl $0x3ffffff,%ebx |
| 1139 | - mov %ebx,h0 |
| 1140 | - |
| 1141 | - add $0x20,m |
| 1142 | - dec %rcx |
| 1143 | - jnz .Ldoblock2 |
| 1144 | - |
| 1145 | - pop %r13 |
| 1146 | - pop %r12 |
| 1147 | - pop %rbx |
| 1148 | - ret |
| 1149 | -ENDPROC(poly1305_2block_sse2) |
| 1150 | --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| 1151 | +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| 1152 | @@ -1,11 +1,14 @@ |
| 1153 | -#! /usr/bin/env perl |
| 1154 | -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
| 1155 | +#!/usr/bin/env perl |
| 1156 | +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| 1157 | # |
| 1158 | -# Licensed under the OpenSSL license (the "License"). You may not use |
| 1159 | -# this file except in compliance with the License. You can obtain a copy |
| 1160 | -# in the file LICENSE in the source distribution or at |
| 1161 | -# https://www.openssl.org/source/license.html |
| 1162 | - |
| 1163 | +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. |
| 1164 | +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 1165 | +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. |
| 1166 | +# |
| 1167 | +# This code is taken from the OpenSSL project but the author, Andy Polyakov, |
| 1168 | +# has relicensed it under the licenses specified in the SPDX header above. |
| 1169 | +# The original headers, including the original license headers, are |
| 1170 | +# included below for completeness. |
| 1171 | # |
| 1172 | # ==================================================================== |
| 1173 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 1174 | @@ -32,7 +35,7 @@ |
| 1175 | # Skylake-X system performance. Since we are likely to suppress |
| 1176 | # AVX512F capability flag [at least on Skylake-X], conversion serves |
| 1177 | # as kind of "investment protection". Note that next *lake processor, |
| 1178 | -# Cannolake, has AVX512IFMA code path to execute... |
| 1179 | +# Cannonlake, has AVX512IFMA code path to execute... |
| 1180 | # |
| 1181 | # Numbers are cycles per processed byte with poly1305_blocks alone, |
| 1182 | # measured with rdtsc at fixed clock frequency. |
| 1183 | @@ -68,39 +71,114 @@ $output = shift; |
| 1184 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 1185 | |
| 1186 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 1187 | +$kernel=0; $kernel=1 if (!$flavour && !$output); |
| 1188 | |
| 1189 | -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 1190 | -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 1191 | -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 1192 | -die "can't locate x86_64-xlate.pl"; |
| 1193 | - |
| 1194 | -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| 1195 | - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| 1196 | - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); |
| 1197 | +if (!$kernel) { |
| 1198 | + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 1199 | + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 1200 | + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 1201 | + die "can't locate x86_64-xlate.pl"; |
| 1202 | + |
| 1203 | + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| 1204 | + *STDOUT=*OUT; |
| 1205 | + |
| 1206 | + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| 1207 | + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| 1208 | + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); |
| 1209 | + } |
| 1210 | + |
| 1211 | + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| 1212 | + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| 1213 | + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); |
| 1214 | + $avx += 1 if ($1==2.11 && $2>=8); |
| 1215 | + } |
| 1216 | + |
| 1217 | + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| 1218 | + `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| 1219 | + $avx = ($1>=10) + ($1>=11); |
| 1220 | + } |
| 1221 | + |
| 1222 | + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| 1223 | + $avx = ($2>=3.0) + ($2>3.0); |
| 1224 | + } |
| 1225 | +} else { |
| 1226 | + $avx = 4; # The kernel uses ifdefs for this. |
| 1227 | } |
| 1228 | |
| 1229 | -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| 1230 | - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| 1231 | - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); |
| 1232 | - $avx += 2 if ($1==2.11 && $2>=8); |
| 1233 | +sub declare_function() { |
| 1234 | + my ($name, $align, $nargs) = @_; |
| 1235 | + if($kernel) { |
| 1236 | + $code .= ".align $align\n"; |
| 1237 | + $code .= "ENTRY($name)\n"; |
| 1238 | + $code .= ".L$name:\n"; |
| 1239 | + } else { |
| 1240 | + $code .= ".globl $name\n"; |
| 1241 | + $code .= ".type $name,\@function,$nargs\n"; |
| 1242 | + $code .= ".align $align\n"; |
| 1243 | + $code .= "$name:\n"; |
| 1244 | + } |
| 1245 | } |
| 1246 | |
| 1247 | -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| 1248 | - `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| 1249 | - $avx = ($1>=10) + ($1>=12); |
| 1250 | +sub end_function() { |
| 1251 | + my ($name) = @_; |
| 1252 | + if($kernel) { |
| 1253 | + $code .= "ENDPROC($name)\n"; |
| 1254 | + } else { |
| 1255 | + $code .= ".size $name,.-$name\n"; |
| 1256 | + } |
| 1257 | } |
| 1258 | |
| 1259 | -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| 1260 | - $avx = ($2>=3.0) + ($2>3.0); |
| 1261 | -} |
| 1262 | +$code.=<<___ if $kernel; |
| 1263 | +#include <linux/linkage.h> |
| 1264 | +___ |
| 1265 | |
| 1266 | -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| 1267 | -*STDOUT=*OUT; |
| 1268 | +if ($avx) { |
| 1269 | +$code.=<<___ if $kernel; |
| 1270 | +.section .rodata |
| 1271 | +___ |
| 1272 | +$code.=<<___; |
| 1273 | +.align 64 |
| 1274 | +.Lconst: |
| 1275 | +.Lmask24: |
| 1276 | +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| 1277 | +.L129: |
| 1278 | +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| 1279 | +.Lmask26: |
| 1280 | +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| 1281 | +.Lpermd_avx2: |
| 1282 | +.long 2,2,2,3,2,0,2,1 |
| 1283 | +.Lpermd_avx512: |
| 1284 | +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| 1285 | + |
| 1286 | +.L2_44_inp_permd: |
| 1287 | +.long 0,1,1,2,2,3,7,7 |
| 1288 | +.L2_44_inp_shift: |
| 1289 | +.quad 0,12,24,64 |
| 1290 | +.L2_44_mask: |
| 1291 | +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| 1292 | +.L2_44_shift_rgt: |
| 1293 | +.quad 44,44,42,64 |
| 1294 | +.L2_44_shift_lft: |
| 1295 | +.quad 8,8,10,64 |
| 1296 | + |
| 1297 | +.align 64 |
| 1298 | +.Lx_mask44: |
| 1299 | +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 1300 | +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 1301 | +.Lx_mask42: |
| 1302 | +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 1303 | +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 1304 | +___ |
| 1305 | +} |
| 1306 | +$code.=<<___ if (!$kernel); |
| 1307 | +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 1308 | +.align 16 |
| 1309 | +___ |
| 1310 | |
| 1311 | my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); |
| 1312 | my ($mac,$nonce)=($inp,$len); # *_emit arguments |
| 1313 | -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); |
| 1314 | -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); |
| 1315 | +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); |
| 1316 | +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); |
| 1317 | |
| 1318 | sub poly1305_iteration { |
| 1319 | # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 |
| 1320 | @@ -155,19 +233,19 @@ ___ |
| 1321 | |
| 1322 | $code.=<<___; |
| 1323 | .text |
| 1324 | - |
| 1325 | +___ |
| 1326 | +$code.=<<___ if (!$kernel); |
| 1327 | .extern OPENSSL_ia32cap_P |
| 1328 | |
| 1329 | -.globl poly1305_init |
| 1330 | -.hidden poly1305_init |
| 1331 | -.globl poly1305_blocks |
| 1332 | -.hidden poly1305_blocks |
| 1333 | -.globl poly1305_emit |
| 1334 | -.hidden poly1305_emit |
| 1335 | - |
| 1336 | -.type poly1305_init,\@function,3 |
| 1337 | -.align 32 |
| 1338 | -poly1305_init: |
| 1339 | +.globl poly1305_init_x86_64 |
| 1340 | +.hidden poly1305_init_x86_64 |
| 1341 | +.globl poly1305_blocks_x86_64 |
| 1342 | +.hidden poly1305_blocks_x86_64 |
| 1343 | +.globl poly1305_emit_x86_64 |
| 1344 | +.hidden poly1305_emit_x86_64 |
| 1345 | +___ |
| 1346 | +&declare_function("poly1305_init_x86_64", 32, 3); |
| 1347 | +$code.=<<___; |
| 1348 | xor %rax,%rax |
| 1349 | mov %rax,0($ctx) # initialize hash value |
| 1350 | mov %rax,8($ctx) |
| 1351 | @@ -175,11 +253,12 @@ poly1305_init: |
| 1352 | |
| 1353 | cmp \$0,$inp |
| 1354 | je .Lno_key |
| 1355 | - |
| 1356 | - lea poly1305_blocks(%rip),%r10 |
| 1357 | - lea poly1305_emit(%rip),%r11 |
| 1358 | ___ |
| 1359 | -$code.=<<___ if ($avx); |
| 1360 | +$code.=<<___ if (!$kernel); |
| 1361 | + lea poly1305_blocks_x86_64(%rip),%r10 |
| 1362 | + lea poly1305_emit_x86_64(%rip),%r11 |
| 1363 | +___ |
| 1364 | +$code.=<<___ if (!$kernel && $avx); |
| 1365 | mov OPENSSL_ia32cap_P+4(%rip),%r9 |
| 1366 | lea poly1305_blocks_avx(%rip),%rax |
| 1367 | lea poly1305_emit_avx(%rip),%rcx |
| 1368 | @@ -187,12 +266,12 @@ $code.=<<___ if ($avx); |
| 1369 | cmovc %rax,%r10 |
| 1370 | cmovc %rcx,%r11 |
| 1371 | ___ |
| 1372 | -$code.=<<___ if ($avx>1); |
| 1373 | +$code.=<<___ if (!$kernel && $avx>1); |
| 1374 | lea poly1305_blocks_avx2(%rip),%rax |
| 1375 | bt \$`5+32`,%r9 # AVX2? |
| 1376 | cmovc %rax,%r10 |
| 1377 | ___ |
| 1378 | -$code.=<<___ if ($avx>3); |
| 1379 | +$code.=<<___ if (!$kernel && $avx>3); |
| 1380 | mov \$`(1<<31|1<<21|1<<16)`,%rax |
| 1381 | shr \$32,%r9 |
| 1382 | and %rax,%r9 |
| 1383 | @@ -207,11 +286,11 @@ $code.=<<___; |
| 1384 | mov %rax,24($ctx) |
| 1385 | mov %rcx,32($ctx) |
| 1386 | ___ |
| 1387 | -$code.=<<___ if ($flavour !~ /elf32/); |
| 1388 | +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); |
| 1389 | mov %r10,0(%rdx) |
| 1390 | mov %r11,8(%rdx) |
| 1391 | ___ |
| 1392 | -$code.=<<___ if ($flavour =~ /elf32/); |
| 1393 | +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); |
| 1394 | mov %r10d,0(%rdx) |
| 1395 | mov %r11d,4(%rdx) |
| 1396 | ___ |
| 1397 | @@ -219,11 +298,11 @@ $code.=<<___; |
| 1398 | mov \$1,%eax |
| 1399 | .Lno_key: |
| 1400 | ret |
| 1401 | -.size poly1305_init,.-poly1305_init |
| 1402 | +___ |
| 1403 | +&end_function("poly1305_init_x86_64"); |
| 1404 | |
| 1405 | -.type poly1305_blocks,\@function,4 |
| 1406 | -.align 32 |
| 1407 | -poly1305_blocks: |
| 1408 | +&declare_function("poly1305_blocks_x86_64", 32, 4); |
| 1409 | +$code.=<<___; |
| 1410 | .cfi_startproc |
| 1411 | .Lblocks: |
| 1412 | shr \$4,$len |
| 1413 | @@ -231,8 +310,6 @@ poly1305_blocks: |
| 1414 | |
| 1415 | push %rbx |
| 1416 | .cfi_push %rbx |
| 1417 | - push %rbp |
| 1418 | -.cfi_push %rbp |
| 1419 | push %r12 |
| 1420 | .cfi_push %r12 |
| 1421 | push %r13 |
| 1422 | @@ -241,6 +318,8 @@ poly1305_blocks: |
| 1423 | .cfi_push %r14 |
| 1424 | push %r15 |
| 1425 | .cfi_push %r15 |
| 1426 | + push $ctx |
| 1427 | +.cfi_push $ctx |
| 1428 | .Lblocks_body: |
| 1429 | |
| 1430 | mov $len,%r15 # reassign $len |
| 1431 | @@ -265,26 +344,29 @@ poly1305_blocks: |
| 1432 | lea 16($inp),$inp |
| 1433 | adc $padbit,$h2 |
| 1434 | ___ |
| 1435 | + |
| 1436 | &poly1305_iteration(); |
| 1437 | + |
| 1438 | $code.=<<___; |
| 1439 | mov $r1,%rax |
| 1440 | dec %r15 # len-=16 |
| 1441 | jnz .Loop |
| 1442 | |
| 1443 | + mov 0(%rsp),$ctx |
| 1444 | +.cfi_restore $ctx |
| 1445 | + |
| 1446 | mov $h0,0($ctx) # store hash value |
| 1447 | mov $h1,8($ctx) |
| 1448 | mov $h2,16($ctx) |
| 1449 | |
| 1450 | - mov 0(%rsp),%r15 |
| 1451 | + mov 8(%rsp),%r15 |
| 1452 | .cfi_restore %r15 |
| 1453 | - mov 8(%rsp),%r14 |
| 1454 | + mov 16(%rsp),%r14 |
| 1455 | .cfi_restore %r14 |
| 1456 | - mov 16(%rsp),%r13 |
| 1457 | + mov 24(%rsp),%r13 |
| 1458 | .cfi_restore %r13 |
| 1459 | - mov 24(%rsp),%r12 |
| 1460 | + mov 32(%rsp),%r12 |
| 1461 | .cfi_restore %r12 |
| 1462 | - mov 32(%rsp),%rbp |
| 1463 | -.cfi_restore %rbp |
| 1464 | mov 40(%rsp),%rbx |
| 1465 | .cfi_restore %rbx |
| 1466 | lea 48(%rsp),%rsp |
| 1467 | @@ -293,11 +375,11 @@ $code.=<<___; |
| 1468 | .Lblocks_epilogue: |
| 1469 | ret |
| 1470 | .cfi_endproc |
| 1471 | -.size poly1305_blocks,.-poly1305_blocks |
| 1472 | +___ |
| 1473 | +&end_function("poly1305_blocks_x86_64"); |
| 1474 | |
| 1475 | -.type poly1305_emit,\@function,3 |
| 1476 | -.align 32 |
| 1477 | -poly1305_emit: |
| 1478 | +&declare_function("poly1305_emit_x86_64", 32, 3); |
| 1479 | +$code.=<<___; |
| 1480 | .Lemit: |
| 1481 | mov 0($ctx),%r8 # load hash value |
| 1482 | mov 8($ctx),%r9 |
| 1483 | @@ -318,10 +400,14 @@ poly1305_emit: |
| 1484 | mov %rcx,8($mac) |
| 1485 | |
| 1486 | ret |
| 1487 | -.size poly1305_emit,.-poly1305_emit |
| 1488 | ___ |
| 1489 | +&end_function("poly1305_emit_x86_64"); |
| 1490 | if ($avx) { |
| 1491 | |
| 1492 | +if($kernel) { |
| 1493 | + $code .= "#ifdef CONFIG_AS_AVX\n"; |
| 1494 | +} |
| 1495 | + |
| 1496 | ######################################################################## |
| 1497 | # Layout of opaque area is following. |
| 1498 | # |
| 1499 | @@ -342,15 +428,19 @@ $code.=<<___; |
| 1500 | .type __poly1305_block,\@abi-omnipotent |
| 1501 | .align 32 |
| 1502 | __poly1305_block: |
| 1503 | + push $ctx |
| 1504 | ___ |
| 1505 | &poly1305_iteration(); |
| 1506 | $code.=<<___; |
| 1507 | + pop $ctx |
| 1508 | ret |
| 1509 | .size __poly1305_block,.-__poly1305_block |
| 1510 | |
| 1511 | .type __poly1305_init_avx,\@abi-omnipotent |
| 1512 | .align 32 |
| 1513 | __poly1305_init_avx: |
| 1514 | + push %rbp |
| 1515 | + mov %rsp,%rbp |
| 1516 | mov $r0,$h0 |
| 1517 | mov $r1,$h1 |
| 1518 | xor $h2,$h2 |
| 1519 | @@ -507,12 +597,13 @@ __poly1305_init_avx: |
| 1520 | mov $d1#d,`16*8+8-64`($ctx) |
| 1521 | |
| 1522 | lea -48-64($ctx),$ctx # size [de-]optimization |
| 1523 | + pop %rbp |
| 1524 | ret |
| 1525 | .size __poly1305_init_avx,.-__poly1305_init_avx |
| 1526 | +___ |
| 1527 | |
| 1528 | -.type poly1305_blocks_avx,\@function,4 |
| 1529 | -.align 32 |
| 1530 | -poly1305_blocks_avx: |
| 1531 | +&declare_function("poly1305_blocks_avx", 32, 4); |
| 1532 | +$code.=<<___; |
| 1533 | .cfi_startproc |
| 1534 | mov 20($ctx),%r8d # is_base2_26 |
| 1535 | cmp \$128,$len |
| 1536 | @@ -532,10 +623,11 @@ poly1305_blocks_avx: |
| 1537 | test \$31,$len |
| 1538 | jz .Leven_avx |
| 1539 | |
| 1540 | - push %rbx |
| 1541 | -.cfi_push %rbx |
| 1542 | push %rbp |
| 1543 | .cfi_push %rbp |
| 1544 | + mov %rsp,%rbp |
| 1545 | + push %rbx |
| 1546 | +.cfi_push %rbx |
| 1547 | push %r12 |
| 1548 | .cfi_push %r12 |
| 1549 | push %r13 |
| 1550 | @@ -645,20 +737,18 @@ poly1305_blocks_avx: |
| 1551 | mov $h2#d,16($ctx) |
| 1552 | .align 16 |
| 1553 | .Ldone_avx: |
| 1554 | - mov 0(%rsp),%r15 |
| 1555 | + pop %r15 |
| 1556 | .cfi_restore %r15 |
| 1557 | - mov 8(%rsp),%r14 |
| 1558 | + pop %r14 |
| 1559 | .cfi_restore %r14 |
| 1560 | - mov 16(%rsp),%r13 |
| 1561 | + pop %r13 |
| 1562 | .cfi_restore %r13 |
| 1563 | - mov 24(%rsp),%r12 |
| 1564 | + pop %r12 |
| 1565 | .cfi_restore %r12 |
| 1566 | - mov 32(%rsp),%rbp |
| 1567 | -.cfi_restore %rbp |
| 1568 | - mov 40(%rsp),%rbx |
| 1569 | + pop %rbx |
| 1570 | .cfi_restore %rbx |
| 1571 | - lea 48(%rsp),%rsp |
| 1572 | -.cfi_adjust_cfa_offset -48 |
| 1573 | + pop %rbp |
| 1574 | +.cfi_restore %rbp |
| 1575 | .Lno_data_avx: |
| 1576 | .Lblocks_avx_epilogue: |
| 1577 | ret |
| 1578 | @@ -667,10 +757,11 @@ poly1305_blocks_avx: |
| 1579 | .align 32 |
| 1580 | .Lbase2_64_avx: |
| 1581 | .cfi_startproc |
| 1582 | - push %rbx |
| 1583 | -.cfi_push %rbx |
| 1584 | push %rbp |
| 1585 | .cfi_push %rbp |
| 1586 | + mov %rsp,%rbp |
| 1587 | + push %rbx |
| 1588 | +.cfi_push %rbx |
| 1589 | push %r12 |
| 1590 | .cfi_push %r12 |
| 1591 | push %r13 |
| 1592 | @@ -736,22 +827,18 @@ poly1305_blocks_avx: |
| 1593 | |
| 1594 | .Lproceed_avx: |
| 1595 | mov %r15,$len |
| 1596 | - |
| 1597 | - mov 0(%rsp),%r15 |
| 1598 | + pop %r15 |
| 1599 | .cfi_restore %r15 |
| 1600 | - mov 8(%rsp),%r14 |
| 1601 | + pop %r14 |
| 1602 | .cfi_restore %r14 |
| 1603 | - mov 16(%rsp),%r13 |
| 1604 | + pop %r13 |
| 1605 | .cfi_restore %r13 |
| 1606 | - mov 24(%rsp),%r12 |
| 1607 | + pop %r12 |
| 1608 | .cfi_restore %r12 |
| 1609 | - mov 32(%rsp),%rbp |
| 1610 | -.cfi_restore %rbp |
| 1611 | - mov 40(%rsp),%rbx |
| 1612 | + pop %rbx |
| 1613 | .cfi_restore %rbx |
| 1614 | - lea 48(%rsp),%rax |
| 1615 | - lea 48(%rsp),%rsp |
| 1616 | -.cfi_adjust_cfa_offset -48 |
| 1617 | + pop %rbp |
| 1618 | +.cfi_restore %rbp |
| 1619 | .Lbase2_64_avx_epilogue: |
| 1620 | jmp .Ldo_avx |
| 1621 | .cfi_endproc |
| 1622 | @@ -768,8 +855,11 @@ poly1305_blocks_avx: |
| 1623 | .Ldo_avx: |
| 1624 | ___ |
| 1625 | $code.=<<___ if (!$win64); |
| 1626 | + lea 8(%rsp),%r10 |
| 1627 | +.cfi_def_cfa_register %r10 |
| 1628 | + and \$-32,%rsp |
| 1629 | + sub \$-8,%rsp |
| 1630 | lea -0x58(%rsp),%r11 |
| 1631 | -.cfi_def_cfa %r11,0x60 |
| 1632 | sub \$0x178,%rsp |
| 1633 | ___ |
| 1634 | $code.=<<___ if ($win64); |
| 1635 | @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); |
| 1636 | .Ldo_avx_epilogue: |
| 1637 | ___ |
| 1638 | $code.=<<___ if (!$win64); |
| 1639 | - lea 0x58(%r11),%rsp |
| 1640 | -.cfi_def_cfa %rsp,8 |
| 1641 | + lea -8(%r10),%rsp |
| 1642 | +.cfi_def_cfa_register %rsp |
| 1643 | ___ |
| 1644 | $code.=<<___; |
| 1645 | vzeroupper |
| 1646 | ret |
| 1647 | .cfi_endproc |
| 1648 | -.size poly1305_blocks_avx,.-poly1305_blocks_avx |
| 1649 | +___ |
| 1650 | +&end_function("poly1305_blocks_avx"); |
| 1651 | |
| 1652 | -.type poly1305_emit_avx,\@function,3 |
| 1653 | -.align 32 |
| 1654 | -poly1305_emit_avx: |
| 1655 | +&declare_function("poly1305_emit_avx", 32, 3); |
| 1656 | +$code.=<<___; |
| 1657 | cmpl \$0,20($ctx) # is_base2_26? |
| 1658 | je .Lemit |
| 1659 | |
| 1660 | @@ -1423,41 +1513,51 @@ poly1305_emit_avx: |
| 1661 | mov %rcx,8($mac) |
| 1662 | |
| 1663 | ret |
| 1664 | -.size poly1305_emit_avx,.-poly1305_emit_avx |
| 1665 | ___ |
| 1666 | +&end_function("poly1305_emit_avx"); |
| 1667 | + |
| 1668 | +if ($kernel) { |
| 1669 | + $code .= "#endif\n"; |
| 1670 | +} |
| 1671 | |
| 1672 | if ($avx>1) { |
| 1673 | + |
| 1674 | +if ($kernel) { |
| 1675 | + $code .= "#ifdef CONFIG_AS_AVX2\n"; |
| 1676 | +} |
| 1677 | + |
| 1678 | my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = |
| 1679 | map("%ymm$_",(0..15)); |
| 1680 | my $S4=$MASK; |
| 1681 | |
| 1682 | +sub poly1305_blocks_avxN { |
| 1683 | + my ($avx512) = @_; |
| 1684 | + my $suffix = $avx512 ? "_avx512" : ""; |
| 1685 | $code.=<<___; |
| 1686 | -.type poly1305_blocks_avx2,\@function,4 |
| 1687 | -.align 32 |
| 1688 | -poly1305_blocks_avx2: |
| 1689 | .cfi_startproc |
| 1690 | mov 20($ctx),%r8d # is_base2_26 |
| 1691 | cmp \$128,$len |
| 1692 | - jae .Lblocks_avx2 |
| 1693 | + jae .Lblocks_avx2$suffix |
| 1694 | test %r8d,%r8d |
| 1695 | jz .Lblocks |
| 1696 | |
| 1697 | -.Lblocks_avx2: |
| 1698 | +.Lblocks_avx2$suffix: |
| 1699 | and \$-16,$len |
| 1700 | - jz .Lno_data_avx2 |
| 1701 | + jz .Lno_data_avx2$suffix |
| 1702 | |
| 1703 | vzeroupper |
| 1704 | |
| 1705 | test %r8d,%r8d |
| 1706 | - jz .Lbase2_64_avx2 |
| 1707 | + jz .Lbase2_64_avx2$suffix |
| 1708 | |
| 1709 | test \$63,$len |
| 1710 | - jz .Leven_avx2 |
| 1711 | + jz .Leven_avx2$suffix |
| 1712 | |
| 1713 | - push %rbx |
| 1714 | -.cfi_push %rbx |
| 1715 | push %rbp |
| 1716 | .cfi_push %rbp |
| 1717 | + mov %rsp,%rbp |
| 1718 | + push %rbx |
| 1719 | +.cfi_push %rbx |
| 1720 | push %r12 |
| 1721 | .cfi_push %r12 |
| 1722 | push %r13 |
| 1723 | @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: |
| 1724 | .cfi_push %r14 |
| 1725 | push %r15 |
| 1726 | .cfi_push %r15 |
| 1727 | -.Lblocks_avx2_body: |
| 1728 | +.Lblocks_avx2_body$suffix: |
| 1729 | |
| 1730 | mov $len,%r15 # reassign $len |
| 1731 | |
| 1732 | @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: |
| 1733 | shr \$2,$s1 |
| 1734 | add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| 1735 | |
| 1736 | -.Lbase2_26_pre_avx2: |
| 1737 | +.Lbase2_26_pre_avx2$suffix: |
| 1738 | add 0($inp),$h0 # accumulate input |
| 1739 | adc 8($inp),$h1 |
| 1740 | lea 16($inp),$inp |
| 1741 | @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: |
| 1742 | mov $r1,%rax |
| 1743 | |
| 1744 | test \$63,%r15 |
| 1745 | - jnz .Lbase2_26_pre_avx2 |
| 1746 | + jnz .Lbase2_26_pre_avx2$suffix |
| 1747 | |
| 1748 | test $padbit,$padbit # if $padbit is zero, |
| 1749 | - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format |
| 1750 | + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format |
| 1751 | |
| 1752 | ################################# base 2^64 -> base 2^26 |
| 1753 | mov $h0,%rax |
| 1754 | @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: |
| 1755 | or $r1,$h2 # h[4] |
| 1756 | |
| 1757 | test %r15,%r15 |
| 1758 | - jz .Lstore_base2_26_avx2 |
| 1759 | + jz .Lstore_base2_26_avx2$suffix |
| 1760 | |
| 1761 | vmovd %rax#d,%x#$H0 |
| 1762 | vmovd %rdx#d,%x#$H1 |
| 1763 | vmovd $h0#d,%x#$H2 |
| 1764 | vmovd $h1#d,%x#$H3 |
| 1765 | vmovd $h2#d,%x#$H4 |
| 1766 | - jmp .Lproceed_avx2 |
| 1767 | + jmp .Lproceed_avx2$suffix |
| 1768 | |
| 1769 | .align 32 |
| 1770 | -.Lstore_base2_64_avx2: |
| 1771 | +.Lstore_base2_64_avx2$suffix: |
| 1772 | mov $h0,0($ctx) |
| 1773 | mov $h1,8($ctx) |
| 1774 | mov $h2,16($ctx) # note that is_base2_26 is zeroed |
| 1775 | - jmp .Ldone_avx2 |
| 1776 | + jmp .Ldone_avx2$suffix |
| 1777 | |
| 1778 | .align 16 |
| 1779 | -.Lstore_base2_26_avx2: |
| 1780 | +.Lstore_base2_26_avx2$suffix: |
| 1781 | mov %rax#d,0($ctx) # store hash value base 2^26 |
| 1782 | mov %rdx#d,4($ctx) |
| 1783 | mov $h0#d,8($ctx) |
| 1784 | mov $h1#d,12($ctx) |
| 1785 | mov $h2#d,16($ctx) |
| 1786 | .align 16 |
| 1787 | -.Ldone_avx2: |
| 1788 | - mov 0(%rsp),%r15 |
| 1789 | +.Ldone_avx2$suffix: |
| 1790 | + pop %r15 |
| 1791 | .cfi_restore %r15 |
| 1792 | - mov 8(%rsp),%r14 |
| 1793 | + pop %r14 |
| 1794 | .cfi_restore %r14 |
| 1795 | - mov 16(%rsp),%r13 |
| 1796 | + pop %r13 |
| 1797 | .cfi_restore %r13 |
| 1798 | - mov 24(%rsp),%r12 |
| 1799 | + pop %r12 |
| 1800 | .cfi_restore %r12 |
| 1801 | - mov 32(%rsp),%rbp |
| 1802 | -.cfi_restore %rbp |
| 1803 | - mov 40(%rsp),%rbx |
| 1804 | + pop %rbx |
| 1805 | .cfi_restore %rbx |
| 1806 | - lea 48(%rsp),%rsp |
| 1807 | -.cfi_adjust_cfa_offset -48 |
| 1808 | -.Lno_data_avx2: |
| 1809 | -.Lblocks_avx2_epilogue: |
| 1810 | + pop %rbp |
| 1811 | +.cfi_restore %rbp |
| 1812 | +.Lno_data_avx2$suffix: |
| 1813 | +.Lblocks_avx2_epilogue$suffix: |
| 1814 | ret |
| 1815 | .cfi_endproc |
| 1816 | |
| 1817 | .align 32 |
| 1818 | -.Lbase2_64_avx2: |
| 1819 | +.Lbase2_64_avx2$suffix: |
| 1820 | .cfi_startproc |
| 1821 | - push %rbx |
| 1822 | -.cfi_push %rbx |
| 1823 | push %rbp |
| 1824 | .cfi_push %rbp |
| 1825 | + mov %rsp,%rbp |
| 1826 | + push %rbx |
| 1827 | +.cfi_push %rbx |
| 1828 | push %r12 |
| 1829 | .cfi_push %r12 |
| 1830 | push %r13 |
| 1831 | @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: |
| 1832 | .cfi_push %r14 |
| 1833 | push %r15 |
| 1834 | .cfi_push %r15 |
| 1835 | -.Lbase2_64_avx2_body: |
| 1836 | +.Lbase2_64_avx2_body$suffix: |
| 1837 | |
| 1838 | mov $len,%r15 # reassign $len |
| 1839 | |
| 1840 | @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: |
| 1841 | add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| 1842 | |
| 1843 | test \$63,$len |
| 1844 | - jz .Linit_avx2 |
| 1845 | + jz .Linit_avx2$suffix |
| 1846 | |
| 1847 | -.Lbase2_64_pre_avx2: |
| 1848 | +.Lbase2_64_pre_avx2$suffix: |
| 1849 | add 0($inp),$h0 # accumulate input |
| 1850 | adc 8($inp),$h1 |
| 1851 | lea 16($inp),$inp |
| 1852 | @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: |
| 1853 | mov $r1,%rax |
| 1854 | |
| 1855 | test \$63,%r15 |
| 1856 | - jnz .Lbase2_64_pre_avx2 |
| 1857 | + jnz .Lbase2_64_pre_avx2$suffix |
| 1858 | |
| 1859 | -.Linit_avx2: |
| 1860 | +.Linit_avx2$suffix: |
| 1861 | ################################# base 2^64 -> base 2^26 |
| 1862 | mov $h0,%rax |
| 1863 | mov $h0,%rdx |
| 1864 | @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: |
| 1865 | |
| 1866 | call __poly1305_init_avx |
| 1867 | |
| 1868 | -.Lproceed_avx2: |
| 1869 | +.Lproceed_avx2$suffix: |
| 1870 | mov %r15,$len # restore $len |
| 1871 | - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| 1872 | +___ |
| 1873 | +$code.=<<___ if (!$kernel); |
| 1874 | + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| 1875 | mov \$`(1<<31|1<<30|1<<16)`,%r11d |
| 1876 | - |
| 1877 | - mov 0(%rsp),%r15 |
| 1878 | +___ |
| 1879 | +$code.=<<___; |
| 1880 | + pop %r15 |
| 1881 | .cfi_restore %r15 |
| 1882 | - mov 8(%rsp),%r14 |
| 1883 | + pop %r14 |
| 1884 | .cfi_restore %r14 |
| 1885 | - mov 16(%rsp),%r13 |
| 1886 | + pop %r13 |
| 1887 | .cfi_restore %r13 |
| 1888 | - mov 24(%rsp),%r12 |
| 1889 | + pop %r12 |
| 1890 | .cfi_restore %r12 |
| 1891 | - mov 32(%rsp),%rbp |
| 1892 | -.cfi_restore %rbp |
| 1893 | - mov 40(%rsp),%rbx |
| 1894 | + pop %rbx |
| 1895 | .cfi_restore %rbx |
| 1896 | - lea 48(%rsp),%rax |
| 1897 | - lea 48(%rsp),%rsp |
| 1898 | -.cfi_adjust_cfa_offset -48 |
| 1899 | -.Lbase2_64_avx2_epilogue: |
| 1900 | - jmp .Ldo_avx2 |
| 1901 | + pop %rbp |
| 1902 | +.cfi_restore %rbp |
| 1903 | +.Lbase2_64_avx2_epilogue$suffix: |
| 1904 | + jmp .Ldo_avx2$suffix |
| 1905 | .cfi_endproc |
| 1906 | |
| 1907 | .align 32 |
| 1908 | -.Leven_avx2: |
| 1909 | +.Leven_avx2$suffix: |
| 1910 | .cfi_startproc |
| 1911 | - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| 1912 | +___ |
| 1913 | +$code.=<<___ if (!$kernel); |
| 1914 | + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| 1915 | +___ |
| 1916 | +$code.=<<___; |
| 1917 | vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 |
| 1918 | vmovd 4*1($ctx),%x#$H1 |
| 1919 | vmovd 4*2($ctx),%x#$H2 |
| 1920 | vmovd 4*3($ctx),%x#$H3 |
| 1921 | vmovd 4*4($ctx),%x#$H4 |
| 1922 | |
| 1923 | -.Ldo_avx2: |
| 1924 | +.Ldo_avx2$suffix: |
| 1925 | ___ |
| 1926 | -$code.=<<___ if ($avx>2); |
| 1927 | +$code.=<<___ if (!$kernel && $avx>2); |
| 1928 | cmp \$512,$len |
| 1929 | jb .Lskip_avx512 |
| 1930 | - and %r11d,%r10d |
| 1931 | - test \$`1<<16`,%r10d # check for AVX512F |
| 1932 | + and %r11d,%r9d |
| 1933 | + test \$`1<<16`,%r9d # check for AVX512F |
| 1934 | jnz .Lblocks_avx512 |
| 1935 | -.Lskip_avx512: |
| 1936 | +.Lskip_avx512$suffix: |
| 1937 | +___ |
| 1938 | +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); |
| 1939 | + cmp \$512,$len |
| 1940 | + jae .Lblocks_avx512 |
| 1941 | ___ |
| 1942 | $code.=<<___ if (!$win64); |
| 1943 | - lea -8(%rsp),%r11 |
| 1944 | -.cfi_def_cfa %r11,16 |
| 1945 | + lea 8(%rsp),%r10 |
| 1946 | +.cfi_def_cfa_register %r10 |
| 1947 | sub \$0x128,%rsp |
| 1948 | ___ |
| 1949 | $code.=<<___ if ($win64); |
| 1950 | - lea -0xf8(%rsp),%r11 |
| 1951 | + lea 8(%rsp),%r10 |
| 1952 | sub \$0x1c8,%rsp |
| 1953 | - vmovdqa %xmm6,0x50(%r11) |
| 1954 | - vmovdqa %xmm7,0x60(%r11) |
| 1955 | - vmovdqa %xmm8,0x70(%r11) |
| 1956 | - vmovdqa %xmm9,0x80(%r11) |
| 1957 | - vmovdqa %xmm10,0x90(%r11) |
| 1958 | - vmovdqa %xmm11,0xa0(%r11) |
| 1959 | - vmovdqa %xmm12,0xb0(%r11) |
| 1960 | - vmovdqa %xmm13,0xc0(%r11) |
| 1961 | - vmovdqa %xmm14,0xd0(%r11) |
| 1962 | - vmovdqa %xmm15,0xe0(%r11) |
| 1963 | -.Ldo_avx2_body: |
| 1964 | + vmovdqa %xmm6,-0xb0(%r10) |
| 1965 | + vmovdqa %xmm7,-0xa0(%r10) |
| 1966 | + vmovdqa %xmm8,-0x90(%r10) |
| 1967 | + vmovdqa %xmm9,-0x80(%r10) |
| 1968 | + vmovdqa %xmm10,-0x70(%r10) |
| 1969 | + vmovdqa %xmm11,-0x60(%r10) |
| 1970 | + vmovdqa %xmm12,-0x50(%r10) |
| 1971 | + vmovdqa %xmm13,-0x40(%r10) |
| 1972 | + vmovdqa %xmm14,-0x30(%r10) |
| 1973 | + vmovdqa %xmm15,-0x20(%r10) |
| 1974 | +.Ldo_avx2_body$suffix: |
| 1975 | ___ |
| 1976 | $code.=<<___; |
| 1977 | lea .Lconst(%rip),%rcx |
| 1978 | @@ -1794,11 +1901,11 @@ $code.=<<___; |
| 1979 | |
| 1980 | vpaddq $H2,$T2,$H2 # accumulate input |
| 1981 | sub \$64,$len |
| 1982 | - jz .Ltail_avx2 |
| 1983 | - jmp .Loop_avx2 |
| 1984 | + jz .Ltail_avx2$suffix |
| 1985 | + jmp .Loop_avx2$suffix |
| 1986 | |
| 1987 | .align 32 |
| 1988 | -.Loop_avx2: |
| 1989 | +.Loop_avx2$suffix: |
| 1990 | ################################################################ |
| 1991 | # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 |
| 1992 | # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 |
| 1993 | @@ -1946,10 +2053,10 @@ $code.=<<___; |
| 1994 | vpor 32(%rcx),$T4,$T4 # padbit, yes, always |
| 1995 | |
| 1996 | sub \$64,$len |
| 1997 | - jnz .Loop_avx2 |
| 1998 | + jnz .Loop_avx2$suffix |
| 1999 | |
| 2000 | .byte 0x66,0x90 |
| 2001 | -.Ltail_avx2: |
| 2002 | +.Ltail_avx2$suffix: |
| 2003 | ################################################################ |
| 2004 | # while above multiplications were by r^4 in all lanes, in last |
| 2005 | # iteration we multiply least significant lane by r^4 and most |
| 2006 | @@ -2087,37 +2194,29 @@ $code.=<<___; |
| 2007 | vmovd %x#$H4,`4*4-48-64`($ctx) |
| 2008 | ___ |
| 2009 | $code.=<<___ if ($win64); |
| 2010 | - vmovdqa 0x50(%r11),%xmm6 |
| 2011 | - vmovdqa 0x60(%r11),%xmm7 |
| 2012 | - vmovdqa 0x70(%r11),%xmm8 |
| 2013 | - vmovdqa 0x80(%r11),%xmm9 |
| 2014 | - vmovdqa 0x90(%r11),%xmm10 |
| 2015 | - vmovdqa 0xa0(%r11),%xmm11 |
| 2016 | - vmovdqa 0xb0(%r11),%xmm12 |
| 2017 | - vmovdqa 0xc0(%r11),%xmm13 |
| 2018 | - vmovdqa 0xd0(%r11),%xmm14 |
| 2019 | - vmovdqa 0xe0(%r11),%xmm15 |
| 2020 | - lea 0xf8(%r11),%rsp |
| 2021 | -.Ldo_avx2_epilogue: |
| 2022 | + vmovdqa -0xb0(%r10),%xmm6 |
| 2023 | + vmovdqa -0xa0(%r10),%xmm7 |
| 2024 | + vmovdqa -0x90(%r10),%xmm8 |
| 2025 | + vmovdqa -0x80(%r10),%xmm9 |
| 2026 | + vmovdqa -0x70(%r10),%xmm10 |
| 2027 | + vmovdqa -0x60(%r10),%xmm11 |
| 2028 | + vmovdqa -0x50(%r10),%xmm12 |
| 2029 | + vmovdqa -0x40(%r10),%xmm13 |
| 2030 | + vmovdqa -0x30(%r10),%xmm14 |
| 2031 | + vmovdqa -0x20(%r10),%xmm15 |
| 2032 | + lea -8(%r10),%rsp |
| 2033 | +.Ldo_avx2_epilogue$suffix: |
| 2034 | ___ |
| 2035 | $code.=<<___ if (!$win64); |
| 2036 | - lea 8(%r11),%rsp |
| 2037 | -.cfi_def_cfa %rsp,8 |
| 2038 | + lea -8(%r10),%rsp |
| 2039 | +.cfi_def_cfa_register %rsp |
| 2040 | ___ |
| 2041 | $code.=<<___; |
| 2042 | vzeroupper |
| 2043 | ret |
| 2044 | .cfi_endproc |
| 2045 | -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 |
| 2046 | ___ |
| 2047 | -####################################################################### |
| 2048 | -if ($avx>2) { |
| 2049 | -# On entry we have input length divisible by 64. But since inner loop |
| 2050 | -# processes 128 bytes per iteration, cases when length is not divisible |
| 2051 | -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| 2052 | -# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| 2053 | -# for this tail, we wouldn't have to even allocate stack frame... |
| 2054 | - |
| 2055 | +if($avx > 2 && $avx512) { |
| 2056 | my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); |
| 2057 | my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); |
| 2058 | my $PADBIT="%zmm30"; |
| 2059 | @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); |
| 2060 | map(s/%y/%z/,($MASK)); |
| 2061 | |
| 2062 | $code.=<<___; |
| 2063 | -.type poly1305_blocks_avx512,\@function,4 |
| 2064 | -.align 32 |
| 2065 | -poly1305_blocks_avx512: |
| 2066 | .cfi_startproc |
| 2067 | .Lblocks_avx512: |
| 2068 | mov \$15,%eax |
| 2069 | kmovw %eax,%k2 |
| 2070 | ___ |
| 2071 | $code.=<<___ if (!$win64); |
| 2072 | - lea -8(%rsp),%r11 |
| 2073 | -.cfi_def_cfa %r11,16 |
| 2074 | + lea 8(%rsp),%r10 |
| 2075 | +.cfi_def_cfa_register %r10 |
| 2076 | sub \$0x128,%rsp |
| 2077 | ___ |
| 2078 | $code.=<<___ if ($win64); |
| 2079 | - lea -0xf8(%rsp),%r11 |
| 2080 | + lea 8(%rsp),%r10 |
| 2081 | sub \$0x1c8,%rsp |
| 2082 | - vmovdqa %xmm6,0x50(%r11) |
| 2083 | - vmovdqa %xmm7,0x60(%r11) |
| 2084 | - vmovdqa %xmm8,0x70(%r11) |
| 2085 | - vmovdqa %xmm9,0x80(%r11) |
| 2086 | - vmovdqa %xmm10,0x90(%r11) |
| 2087 | - vmovdqa %xmm11,0xa0(%r11) |
| 2088 | - vmovdqa %xmm12,0xb0(%r11) |
| 2089 | - vmovdqa %xmm13,0xc0(%r11) |
| 2090 | - vmovdqa %xmm14,0xd0(%r11) |
| 2091 | - vmovdqa %xmm15,0xe0(%r11) |
| 2092 | + vmovdqa %xmm6,-0xb0(%r10) |
| 2093 | + vmovdqa %xmm7,-0xa0(%r10) |
| 2094 | + vmovdqa %xmm8,-0x90(%r10) |
| 2095 | + vmovdqa %xmm9,-0x80(%r10) |
| 2096 | + vmovdqa %xmm10,-0x70(%r10) |
| 2097 | + vmovdqa %xmm11,-0x60(%r10) |
| 2098 | + vmovdqa %xmm12,-0x50(%r10) |
| 2099 | + vmovdqa %xmm13,-0x40(%r10) |
| 2100 | + vmovdqa %xmm14,-0x30(%r10) |
| 2101 | + vmovdqa %xmm15,-0x20(%r10) |
| 2102 | .Ldo_avx512_body: |
| 2103 | ___ |
| 2104 | $code.=<<___; |
| 2105 | @@ -2679,7 +2775,7 @@ $code.=<<___; |
| 2106 | |
| 2107 | lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 |
| 2108 | add \$64,$len |
| 2109 | - jnz .Ltail_avx2 |
| 2110 | + jnz .Ltail_avx2$suffix |
| 2111 | |
| 2112 | vpsubq $T2,$H2,$H2 # undo input accumulation |
| 2113 | vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced |
| 2114 | @@ -2690,29 +2786,61 @@ $code.=<<___; |
| 2115 | vzeroall |
| 2116 | ___ |
| 2117 | $code.=<<___ if ($win64); |
| 2118 | - movdqa 0x50(%r11),%xmm6 |
| 2119 | - movdqa 0x60(%r11),%xmm7 |
| 2120 | - movdqa 0x70(%r11),%xmm8 |
| 2121 | - movdqa 0x80(%r11),%xmm9 |
| 2122 | - movdqa 0x90(%r11),%xmm10 |
| 2123 | - movdqa 0xa0(%r11),%xmm11 |
| 2124 | - movdqa 0xb0(%r11),%xmm12 |
| 2125 | - movdqa 0xc0(%r11),%xmm13 |
| 2126 | - movdqa 0xd0(%r11),%xmm14 |
| 2127 | - movdqa 0xe0(%r11),%xmm15 |
| 2128 | - lea 0xf8(%r11),%rsp |
| 2129 | + movdqa -0xb0(%r10),%xmm6 |
| 2130 | + movdqa -0xa0(%r10),%xmm7 |
| 2131 | + movdqa -0x90(%r10),%xmm8 |
| 2132 | + movdqa -0x80(%r10),%xmm9 |
| 2133 | + movdqa -0x70(%r10),%xmm10 |
| 2134 | + movdqa -0x60(%r10),%xmm11 |
| 2135 | + movdqa -0x50(%r10),%xmm12 |
| 2136 | + movdqa -0x40(%r10),%xmm13 |
| 2137 | + movdqa -0x30(%r10),%xmm14 |
| 2138 | + movdqa -0x20(%r10),%xmm15 |
| 2139 | + lea -8(%r10),%rsp |
| 2140 | .Ldo_avx512_epilogue: |
| 2141 | ___ |
| 2142 | $code.=<<___ if (!$win64); |
| 2143 | - lea 8(%r11),%rsp |
| 2144 | -.cfi_def_cfa %rsp,8 |
| 2145 | + lea -8(%r10),%rsp |
| 2146 | +.cfi_def_cfa_register %rsp |
| 2147 | ___ |
| 2148 | $code.=<<___; |
| 2149 | ret |
| 2150 | .cfi_endproc |
| 2151 | -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 |
| 2152 | ___ |
| 2153 | -if ($avx>3) { |
| 2154 | + |
| 2155 | +} |
| 2156 | + |
| 2157 | +} |
| 2158 | + |
| 2159 | +&declare_function("poly1305_blocks_avx2", 32, 4); |
| 2160 | +poly1305_blocks_avxN(0); |
| 2161 | +&end_function("poly1305_blocks_avx2"); |
| 2162 | + |
| 2163 | +if($kernel) { |
| 2164 | + $code .= "#endif\n"; |
| 2165 | +} |
| 2166 | + |
| 2167 | +####################################################################### |
| 2168 | +if ($avx>2) { |
| 2169 | +# On entry we have input length divisible by 64. But since inner loop |
| 2170 | +# processes 128 bytes per iteration, cases when length is not divisible |
| 2171 | +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| 2172 | +# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| 2173 | +# for this tail, we wouldn't have to even allocate stack frame... |
| 2174 | + |
| 2175 | +if($kernel) { |
| 2176 | + $code .= "#ifdef CONFIG_AS_AVX512\n"; |
| 2177 | +} |
| 2178 | + |
| 2179 | +&declare_function("poly1305_blocks_avx512", 32, 4); |
| 2180 | +poly1305_blocks_avxN(1); |
| 2181 | +&end_function("poly1305_blocks_avx512"); |
| 2182 | + |
| 2183 | +if ($kernel) { |
| 2184 | + $code .= "#endif\n"; |
| 2185 | +} |
| 2186 | + |
| 2187 | +if (!$kernel && $avx>3) { |
| 2188 | ######################################################################## |
| 2189 | # VPMADD52 version using 2^44 radix. |
| 2190 | # |
| 2191 | @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: |
| 2192 | .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 |
| 2193 | ___ |
| 2194 | } } } |
| 2195 | -$code.=<<___; |
| 2196 | -.align 64 |
| 2197 | -.Lconst: |
| 2198 | -.Lmask24: |
| 2199 | -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| 2200 | -.L129: |
| 2201 | -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| 2202 | -.Lmask26: |
| 2203 | -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| 2204 | -.Lpermd_avx2: |
| 2205 | -.long 2,2,2,3,2,0,2,1 |
| 2206 | -.Lpermd_avx512: |
| 2207 | -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| 2208 | - |
| 2209 | -.L2_44_inp_permd: |
| 2210 | -.long 0,1,1,2,2,3,7,7 |
| 2211 | -.L2_44_inp_shift: |
| 2212 | -.quad 0,12,24,64 |
| 2213 | -.L2_44_mask: |
| 2214 | -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| 2215 | -.L2_44_shift_rgt: |
| 2216 | -.quad 44,44,42,64 |
| 2217 | -.L2_44_shift_lft: |
| 2218 | -.quad 8,8,10,64 |
| 2219 | - |
| 2220 | -.align 64 |
| 2221 | -.Lx_mask44: |
| 2222 | -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 2223 | -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 2224 | -.Lx_mask42: |
| 2225 | -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 2226 | -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 2227 | -___ |
| 2228 | } |
| 2229 | -$code.=<<___; |
| 2230 | -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 2231 | -.align 16 |
| 2232 | -___ |
| 2233 | |
| 2234 | +if (!$kernel) |
| 2235 | { # chacha20-poly1305 helpers |
| 2236 | my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
| 2237 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order |
| 2238 | @@ -4038,17 +4130,17 @@ avx_handler: |
| 2239 | |
| 2240 | .section .pdata |
| 2241 | .align 4 |
| 2242 | - .rva .LSEH_begin_poly1305_init |
| 2243 | - .rva .LSEH_end_poly1305_init |
| 2244 | - .rva .LSEH_info_poly1305_init |
| 2245 | - |
| 2246 | - .rva .LSEH_begin_poly1305_blocks |
| 2247 | - .rva .LSEH_end_poly1305_blocks |
| 2248 | - .rva .LSEH_info_poly1305_blocks |
| 2249 | - |
| 2250 | - .rva .LSEH_begin_poly1305_emit |
| 2251 | - .rva .LSEH_end_poly1305_emit |
| 2252 | - .rva .LSEH_info_poly1305_emit |
| 2253 | + .rva .LSEH_begin_poly1305_init_x86_64 |
| 2254 | + .rva .LSEH_end_poly1305_init_x86_64 |
| 2255 | + .rva .LSEH_info_poly1305_init_x86_64 |
| 2256 | + |
| 2257 | + .rva .LSEH_begin_poly1305_blocks_x86_64 |
| 2258 | + .rva .LSEH_end_poly1305_blocks_x86_64 |
| 2259 | + .rva .LSEH_info_poly1305_blocks_x86_64 |
| 2260 | + |
| 2261 | + .rva .LSEH_begin_poly1305_emit_x86_64 |
| 2262 | + .rva .LSEH_end_poly1305_emit_x86_64 |
| 2263 | + .rva .LSEH_info_poly1305_emit_x86_64 |
| 2264 | ___ |
| 2265 | $code.=<<___ if ($avx); |
| 2266 | .rva .LSEH_begin_poly1305_blocks_avx |
| 2267 | @@ -4088,20 +4180,20 @@ ___ |
| 2268 | $code.=<<___; |
| 2269 | .section .xdata |
| 2270 | .align 8 |
| 2271 | -.LSEH_info_poly1305_init: |
| 2272 | +.LSEH_info_poly1305_init_x86_64: |
| 2273 | .byte 9,0,0,0 |
| 2274 | .rva se_handler |
| 2275 | - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init |
| 2276 | + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 |
| 2277 | |
| 2278 | -.LSEH_info_poly1305_blocks: |
| 2279 | +.LSEH_info_poly1305_blocks_x86_64: |
| 2280 | .byte 9,0,0,0 |
| 2281 | .rva se_handler |
| 2282 | .rva .Lblocks_body,.Lblocks_epilogue |
| 2283 | |
| 2284 | -.LSEH_info_poly1305_emit: |
| 2285 | +.LSEH_info_poly1305_emit_x86_64: |
| 2286 | .byte 9,0,0,0 |
| 2287 | .rva se_handler |
| 2288 | - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit |
| 2289 | + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 |
| 2290 | ___ |
| 2291 | $code.=<<___ if ($avx); |
| 2292 | .LSEH_info_poly1305_blocks_avx_1: |
| 2293 | @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); |
| 2294 | ___ |
| 2295 | } |
| 2296 | |
| 2297 | +open SELF,$0; |
| 2298 | +while(<SELF>) { |
| 2299 | + next if (/^#!/); |
| 2300 | + last if (!s/^#/\/\// and !/^$/); |
| 2301 | + print; |
| 2302 | +} |
| 2303 | +close SELF; |
| 2304 | + |
| 2305 | foreach (split('\n',$code)) { |
| 2306 | s/\`([^\`]*)\`/eval($1)/ge; |
| 2307 | s/%r([a-z]+)#d/%e$1/g; |
| 2308 | s/%r([0-9]+)#d/%r$1d/g; |
| 2309 | s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; |
| 2310 | |
| 2311 | + if ($kernel) { |
| 2312 | + s/(^\.type.*),[0-9]+$/\1/; |
| 2313 | + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; |
| 2314 | + next if /^\.cfi.*/; |
| 2315 | + } |
| 2316 | + |
| 2317 | print $_,"\n"; |
| 2318 | } |
| 2319 | close STDOUT; |
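The vector paths above repeatedly convert the accumulator between the two radices this file works in: the scalar code keeps the roughly 130-bit hash in three 64-bit words (base 2^64), while the AVX/AVX2/AVX-512 paths keep it as five 26-bit limbs (base 2^26), as at the .Linit_avx2 and .Lstore_base2_26 labels. A minimal C sketch of that split, assuming the limb layout that the glue code's convert_to_base2_64() (further down in this patch) inverts; the function name is illustrative only:

    #include <stdint.h>

    /* Split a base 2^64 Poly1305 accumulator (hs[0..2], low word first)
     * into five base 2^26 limbs (h[0..4]).  The bit boundaries mirror the
     * inverse packing done by convert_to_base2_64() in the glue code. */
    static void base2_64_to_base2_26(const uint64_t hs[3], uint32_t h[5])
    {
            h[0] = hs[0] & 0x3ffffff;
            h[1] = (hs[0] >> 26) & 0x3ffffff;
            h[2] = ((hs[0] >> 52) | (hs[1] << 12)) & 0x3ffffff;
            h[3] = (hs[1] >> 14) & 0x3ffffff;
            h[4] = (uint32_t)((hs[1] >> 40) | (hs[2] << 24));
    }

Going the other way is exactly what the glue code has to do when it falls back from the vector code to the scalar code mid-stream.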
| 2320 | --- a/arch/x86/crypto/poly1305_glue.c |
| 2321 | +++ b/arch/x86/crypto/poly1305_glue.c |
| 2322 | @@ -1,8 +1,6 @@ |
| 2323 | -// SPDX-License-Identifier: GPL-2.0-or-later |
| 2324 | +// SPDX-License-Identifier: GPL-2.0 OR MIT |
| 2325 | /* |
| 2326 | - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code |
| 2327 | - * |
| 2328 | - * Copyright (C) 2015 Martin Willi |
| 2329 | + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 2330 | */ |
| 2331 | |
| 2332 | #include <crypto/algapi.h> |
| 2333 | @@ -13,279 +11,170 @@ |
| 2334 | #include <linux/jump_label.h> |
| 2335 | #include <linux/kernel.h> |
| 2336 | #include <linux/module.h> |
| 2337 | +#include <asm/intel-family.h> |
| 2338 | #include <asm/simd.h> |
| 2339 | |
| 2340 | -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, |
| 2341 | - const u32 *r, unsigned int blocks); |
| 2342 | -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, |
| 2343 | - unsigned int blocks, const u32 *u); |
| 2344 | -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, |
| 2345 | - unsigned int blocks, const u32 *u); |
| 2346 | +asmlinkage void poly1305_init_x86_64(void *ctx, |
| 2347 | + const u8 key[POLY1305_KEY_SIZE]); |
| 2348 | +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, |
| 2349 | + const size_t len, const u32 padbit); |
| 2350 | +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2351 | + const u32 nonce[4]); |
| 2352 | +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2353 | + const u32 nonce[4]); |
| 2354 | +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, |
| 2355 | + const u32 padbit); |
| 2356 | +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, |
| 2357 | + const u32 padbit); |
| 2358 | +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, |
| 2359 | + const size_t len, const u32 padbit); |
| 2360 | |
| 2361 | -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); |
| 2362 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); |
| 2363 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); |
| 2364 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); |
| 2365 | |
| 2366 | -static inline u64 mlt(u64 a, u64 b) |
| 2367 | -{ |
| 2368 | - return a * b; |
| 2369 | -} |
| 2370 | - |
| 2371 | -static inline u32 sr(u64 v, u_char n) |
| 2372 | -{ |
| 2373 | - return v >> n; |
| 2374 | -} |
| 2375 | - |
| 2376 | -static inline u32 and(u32 v, u32 mask) |
| 2377 | -{ |
| 2378 | - return v & mask; |
| 2379 | -} |
| 2380 | - |
| 2381 | -static void poly1305_simd_mult(u32 *a, const u32 *b) |
| 2382 | -{ |
| 2383 | - u8 m[POLY1305_BLOCK_SIZE]; |
| 2384 | - |
| 2385 | - memset(m, 0, sizeof(m)); |
| 2386 | - /* The poly1305 block function adds a hi-bit to the accumulator which |
| 2387 | - * we don't need for key multiplication; compensate for it. */ |
| 2388 | - a[4] -= 1 << 24; |
| 2389 | - poly1305_block_sse2(a, m, b, 1); |
| 2390 | -} |
| 2391 | - |
| 2392 | -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) |
| 2393 | -{ |
| 2394 | - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ |
| 2395 | - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; |
| 2396 | - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; |
| 2397 | - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; |
| 2398 | - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; |
| 2399 | - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; |
| 2400 | -} |
| 2401 | +struct poly1305_arch_internal { |
| 2402 | + union { |
| 2403 | + struct { |
| 2404 | + u32 h[5]; |
| 2405 | + u32 is_base2_26; |
| 2406 | + }; |
| 2407 | + u64 hs[3]; |
| 2408 | + }; |
| 2409 | + u64 r[2]; |
| 2410 | + u64 pad; |
| 2411 | + struct { u32 r2, r1, r4, r3; } rn[9]; |
| 2412 | +}; |
| 2413 | |
| 2414 | -static void poly1305_integer_blocks(struct poly1305_state *state, |
| 2415 | - const struct poly1305_key *key, |
| 2416 | - const void *src, |
| 2417 | - unsigned int nblocks, u32 hibit) |
| 2418 | +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit |
| 2419 | + * the unfortunate situation of using AVX and then having to go back to scalar |
| 2420 | + * -- because the user is silly and has called the update function from two |
| 2421 | + * separate contexts -- then we need to convert back to the original base before |
| 2422 | + * proceeding. It is possible to reason that the initial reduction below is |
| 2423 | + * sufficient given the implementation invariants. However, for an avoidance of |
| 2424 | + * doubt and because this is not performance critical, we do the full reduction |
| 2425 | + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py |
| 2426 | + */ |
| 2427 | +static void convert_to_base2_64(void *ctx) |
| 2428 | { |
| 2429 | - u32 r0, r1, r2, r3, r4; |
| 2430 | - u32 s1, s2, s3, s4; |
| 2431 | - u32 h0, h1, h2, h3, h4; |
| 2432 | - u64 d0, d1, d2, d3, d4; |
| 2433 | + struct poly1305_arch_internal *state = ctx; |
| 2434 | + u32 cy; |
| 2435 | |
| 2436 | - if (!nblocks) |
| 2437 | + if (!state->is_base2_26) |
| 2438 | return; |
| 2439 | |
| 2440 | - r0 = key->r[0]; |
| 2441 | - r1 = key->r[1]; |
| 2442 | - r2 = key->r[2]; |
| 2443 | - r3 = key->r[3]; |
| 2444 | - r4 = key->r[4]; |
| 2445 | - |
| 2446 | - s1 = r1 * 5; |
| 2447 | - s2 = r2 * 5; |
| 2448 | - s3 = r3 * 5; |
| 2449 | - s4 = r4 * 5; |
| 2450 | - |
| 2451 | - h0 = state->h[0]; |
| 2452 | - h1 = state->h[1]; |
| 2453 | - h2 = state->h[2]; |
| 2454 | - h3 = state->h[3]; |
| 2455 | - h4 = state->h[4]; |
| 2456 | - |
| 2457 | - do { |
| 2458 | - /* h += m[i] */ |
| 2459 | - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; |
| 2460 | - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; |
| 2461 | - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; |
| 2462 | - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; |
| 2463 | - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); |
| 2464 | - |
| 2465 | - /* h *= r */ |
| 2466 | - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + |
| 2467 | - mlt(h3, s2) + mlt(h4, s1); |
| 2468 | - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + |
| 2469 | - mlt(h3, s3) + mlt(h4, s2); |
| 2470 | - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + |
| 2471 | - mlt(h3, s4) + mlt(h4, s3); |
| 2472 | - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + |
| 2473 | - mlt(h3, r0) + mlt(h4, s4); |
| 2474 | - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + |
| 2475 | - mlt(h3, r1) + mlt(h4, r0); |
| 2476 | - |
| 2477 | - /* (partial) h %= p */ |
| 2478 | - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); |
| 2479 | - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); |
| 2480 | - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); |
| 2481 | - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); |
| 2482 | - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); |
| 2483 | - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; |
| 2484 | - |
| 2485 | - src += POLY1305_BLOCK_SIZE; |
| 2486 | - } while (--nblocks); |
| 2487 | - |
| 2488 | - state->h[0] = h0; |
| 2489 | - state->h[1] = h1; |
| 2490 | - state->h[2] = h2; |
| 2491 | - state->h[3] = h3; |
| 2492 | - state->h[4] = h4; |
| 2493 | -} |
| 2494 | - |
| 2495 | -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) |
| 2496 | -{ |
| 2497 | - u32 h0, h1, h2, h3, h4; |
| 2498 | - u32 g0, g1, g2, g3, g4; |
| 2499 | - u32 mask; |
| 2500 | - |
| 2501 | - /* fully carry h */ |
| 2502 | - h0 = state->h[0]; |
| 2503 | - h1 = state->h[1]; |
| 2504 | - h2 = state->h[2]; |
| 2505 | - h3 = state->h[3]; |
| 2506 | - h4 = state->h[4]; |
| 2507 | - |
| 2508 | - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; |
| 2509 | - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; |
| 2510 | - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; |
| 2511 | - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; |
| 2512 | - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; |
| 2513 | - |
| 2514 | - /* compute h + -p */ |
| 2515 | - g0 = h0 + 5; |
| 2516 | - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; |
| 2517 | - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; |
| 2518 | - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; |
| 2519 | - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; |
| 2520 | - |
| 2521 | - /* select h if h < p, or h + -p if h >= p */ |
| 2522 | - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; |
| 2523 | - g0 &= mask; |
| 2524 | - g1 &= mask; |
| 2525 | - g2 &= mask; |
| 2526 | - g3 &= mask; |
| 2527 | - g4 &= mask; |
| 2528 | - mask = ~mask; |
| 2529 | - h0 = (h0 & mask) | g0; |
| 2530 | - h1 = (h1 & mask) | g1; |
| 2531 | - h2 = (h2 & mask) | g2; |
| 2532 | - h3 = (h3 & mask) | g3; |
| 2533 | - h4 = (h4 & mask) | g4; |
| 2534 | - |
| 2535 | - /* h = h % (2^128) */ |
| 2536 | - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); |
| 2537 | - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); |
| 2538 | - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); |
| 2539 | - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); |
| 2540 | -} |
| 2541 | - |
| 2542 | -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) |
| 2543 | -{ |
| 2544 | - poly1305_integer_setkey(desc->opaque_r, key); |
| 2545 | - desc->s[0] = get_unaligned_le32(key + 16); |
| 2546 | - desc->s[1] = get_unaligned_le32(key + 20); |
| 2547 | - desc->s[2] = get_unaligned_le32(key + 24); |
| 2548 | - desc->s[3] = get_unaligned_le32(key + 28); |
| 2549 | - poly1305_core_init(&desc->h); |
| 2550 | - desc->buflen = 0; |
| 2551 | - desc->sset = true; |
| 2552 | - desc->rset = 1; |
| 2553 | -} |
| 2554 | -EXPORT_SYMBOL_GPL(poly1305_init_arch); |
| 2555 | - |
| 2556 | -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, |
| 2557 | - const u8 *src, unsigned int srclen) |
| 2558 | -{ |
| 2559 | - if (!dctx->sset) { |
| 2560 | - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { |
| 2561 | - poly1305_integer_setkey(dctx->r, src); |
| 2562 | - src += POLY1305_BLOCK_SIZE; |
| 2563 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2564 | - dctx->rset = 1; |
| 2565 | - } |
| 2566 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2567 | - dctx->s[0] = get_unaligned_le32(src + 0); |
| 2568 | - dctx->s[1] = get_unaligned_le32(src + 4); |
| 2569 | - dctx->s[2] = get_unaligned_le32(src + 8); |
| 2570 | - dctx->s[3] = get_unaligned_le32(src + 12); |
| 2571 | - src += POLY1305_BLOCK_SIZE; |
| 2572 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2573 | - dctx->sset = true; |
| 2574 | - } |
| 2575 | + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; |
| 2576 | + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; |
| 2577 | + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; |
| 2578 | + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; |
| 2579 | + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; |
| 2580 | + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); |
| 2581 | + state->hs[2] = state->h[4] >> 24; |
| 2582 | +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) |
| 2583 | + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); |
| 2584 | + state->hs[2] &= 3; |
| 2585 | + state->hs[0] += cy; |
| 2586 | + state->hs[1] += (cy = ULT(state->hs[0], cy)); |
| 2587 | + state->hs[2] += ULT(state->hs[1], cy); |
| 2588 | +#undef ULT |
| 2589 | + state->is_base2_26 = 0; |
| 2590 | +} |
| 2591 | + |
| 2592 | +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) |
| 2593 | +{ |
| 2594 | + poly1305_init_x86_64(ctx, key); |
| 2595 | +} |
| 2596 | + |
| 2597 | +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, |
| 2598 | + const u32 padbit) |
| 2599 | +{ |
| 2600 | + struct poly1305_arch_internal *state = ctx; |
| 2601 | + |
| 2602 | + /* SIMD disables preemption, so relax after processing each page. */ |
| 2603 | + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || |
| 2604 | + PAGE_SIZE % POLY1305_BLOCK_SIZE); |
| 2605 | + |
| 2606 | + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| 2607 | + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || |
| 2608 | + !crypto_simd_usable()) { |
| 2609 | + convert_to_base2_64(ctx); |
| 2610 | + poly1305_blocks_x86_64(ctx, inp, len, padbit); |
| 2611 | + return; |
| 2612 | } |
| 2613 | - return srclen; |
| 2614 | -} |
| 2615 | |
| 2616 | -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, |
| 2617 | - const u8 *src, unsigned int srclen) |
| 2618 | -{ |
| 2619 | - unsigned int datalen; |
| 2620 | + for (;;) { |
| 2621 | + const size_t bytes = min_t(size_t, len, PAGE_SIZE); |
| 2622 | |
| 2623 | - if (unlikely(!dctx->sset)) { |
| 2624 | - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| 2625 | - src += srclen - datalen; |
| 2626 | - srclen = datalen; |
| 2627 | - } |
| 2628 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2629 | - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, |
| 2630 | - srclen / POLY1305_BLOCK_SIZE, 1); |
| 2631 | - srclen %= POLY1305_BLOCK_SIZE; |
| 2632 | + kernel_fpu_begin(); |
| 2633 | + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) |
| 2634 | + poly1305_blocks_avx512(ctx, inp, bytes, padbit); |
| 2635 | + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) |
| 2636 | + poly1305_blocks_avx2(ctx, inp, bytes, padbit); |
| 2637 | + else |
| 2638 | + poly1305_blocks_avx(ctx, inp, bytes, padbit); |
| 2639 | + kernel_fpu_end(); |
| 2640 | + len -= bytes; |
| 2641 | + if (!len) |
| 2642 | + break; |
| 2643 | + inp += bytes; |
| 2644 | } |
| 2645 | - return srclen; |
| 2646 | } |
| 2647 | |
| 2648 | -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, |
| 2649 | - const u8 *src, unsigned int srclen) |
| 2650 | -{ |
| 2651 | - unsigned int blocks, datalen; |
| 2652 | +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2653 | + const u32 nonce[4]) |
| 2654 | +{ |
| 2655 | + struct poly1305_arch_internal *state = ctx; |
| 2656 | + |
| 2657 | + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| 2658 | + !state->is_base2_26 || !crypto_simd_usable()) { |
| 2659 | + convert_to_base2_64(ctx); |
| 2660 | + poly1305_emit_x86_64(ctx, mac, nonce); |
| 2661 | + } else |
| 2662 | + poly1305_emit_avx(ctx, mac, nonce); |
| 2663 | +} |
| 2664 | + |
| 2665 | +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) |
| 2666 | +{ |
| 2667 | + poly1305_simd_init(&dctx->h, key); |
| 2668 | + dctx->s[0] = get_unaligned_le32(&key[16]); |
| 2669 | + dctx->s[1] = get_unaligned_le32(&key[20]); |
| 2670 | + dctx->s[2] = get_unaligned_le32(&key[24]); |
| 2671 | + dctx->s[3] = get_unaligned_le32(&key[28]); |
| 2672 | + dctx->buflen = 0; |
| 2673 | + dctx->sset = true; |
| 2674 | +} |
| 2675 | +EXPORT_SYMBOL(poly1305_init_arch); |
| 2676 | |
| 2677 | +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, |
| 2678 | + const u8 *inp, unsigned int len) |
| 2679 | +{ |
| 2680 | + unsigned int acc = 0; |
| 2681 | if (unlikely(!dctx->sset)) { |
| 2682 | - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| 2683 | - src += srclen - datalen; |
| 2684 | - srclen = datalen; |
| 2685 | - } |
| 2686 | - |
| 2687 | - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| 2688 | - static_branch_likely(&poly1305_use_avx2) && |
| 2689 | - srclen >= POLY1305_BLOCK_SIZE * 4) { |
| 2690 | - if (unlikely(dctx->rset < 4)) { |
| 2691 | - if (dctx->rset < 2) { |
| 2692 | - dctx->r[1] = dctx->r[0]; |
| 2693 | - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| 2694 | - } |
| 2695 | - dctx->r[2] = dctx->r[1]; |
| 2696 | - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); |
| 2697 | - dctx->r[3] = dctx->r[2]; |
| 2698 | - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); |
| 2699 | - dctx->rset = 4; |
| 2700 | + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { |
| 2701 | + poly1305_simd_init(&dctx->h, inp); |
| 2702 | + inp += POLY1305_BLOCK_SIZE; |
| 2703 | + len -= POLY1305_BLOCK_SIZE; |
| 2704 | + acc += POLY1305_BLOCK_SIZE; |
| 2705 | + dctx->rset = 1; |
| 2706 | } |
| 2707 | - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); |
| 2708 | - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, |
| 2709 | - dctx->r[1].r); |
| 2710 | - src += POLY1305_BLOCK_SIZE * 4 * blocks; |
| 2711 | - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; |
| 2712 | - } |
| 2713 | - |
| 2714 | - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { |
| 2715 | - if (unlikely(dctx->rset < 2)) { |
| 2716 | - dctx->r[1] = dctx->r[0]; |
| 2717 | - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| 2718 | - dctx->rset = 2; |
| 2719 | + if (len >= POLY1305_BLOCK_SIZE) { |
| 2720 | + dctx->s[0] = get_unaligned_le32(&inp[0]); |
| 2721 | + dctx->s[1] = get_unaligned_le32(&inp[4]); |
| 2722 | + dctx->s[2] = get_unaligned_le32(&inp[8]); |
| 2723 | + dctx->s[3] = get_unaligned_le32(&inp[12]); |
| 2724 | + inp += POLY1305_BLOCK_SIZE; |
| 2725 | + len -= POLY1305_BLOCK_SIZE; |
| 2726 | + acc += POLY1305_BLOCK_SIZE; |
| 2727 | + dctx->sset = true; |
| 2728 | } |
| 2729 | - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); |
| 2730 | - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, |
| 2731 | - blocks, dctx->r[1].r); |
| 2732 | - src += POLY1305_BLOCK_SIZE * 2 * blocks; |
| 2733 | - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; |
| 2734 | - } |
| 2735 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2736 | - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); |
| 2737 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2738 | } |
| 2739 | - return srclen; |
| 2740 | + return acc; |
| 2741 | } |
| 2742 | |
| 2743 | void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, |
| 2744 | unsigned int srclen) |
| 2745 | { |
| 2746 | - unsigned int bytes; |
| 2747 | + unsigned int bytes, used; |
| 2748 | |
| 2749 | if (unlikely(dctx->buflen)) { |
| 2750 | bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); |
| 2751 | @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130 |
| 2752 | dctx->buflen += bytes; |
| 2753 | |
| 2754 | if (dctx->buflen == POLY1305_BLOCK_SIZE) { |
| 2755 | - if (static_branch_likely(&poly1305_use_simd) && |
| 2756 | - likely(crypto_simd_usable())) { |
| 2757 | - kernel_fpu_begin(); |
| 2758 | - poly1305_simd_blocks(dctx, dctx->buf, |
| 2759 | - POLY1305_BLOCK_SIZE); |
| 2760 | - kernel_fpu_end(); |
| 2761 | - } else { |
| 2762 | - poly1305_scalar_blocks(dctx, dctx->buf, |
| 2763 | - POLY1305_BLOCK_SIZE); |
| 2764 | - } |
| 2765 | + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) |
| 2766 | + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); |
| 2767 | dctx->buflen = 0; |
| 2768 | } |
| 2769 | } |
| 2770 | |
| 2771 | if (likely(srclen >= POLY1305_BLOCK_SIZE)) { |
| 2772 | - if (static_branch_likely(&poly1305_use_simd) && |
| 2773 | - likely(crypto_simd_usable())) { |
| 2774 | - kernel_fpu_begin(); |
| 2775 | - bytes = poly1305_simd_blocks(dctx, src, srclen); |
| 2776 | - kernel_fpu_end(); |
| 2777 | - } else { |
| 2778 | - bytes = poly1305_scalar_blocks(dctx, src, srclen); |
| 2779 | - } |
| 2780 | - src += srclen - bytes; |
| 2781 | - srclen = bytes; |
| 2782 | + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); |
| 2783 | + srclen -= bytes; |
| 2784 | + used = crypto_poly1305_setdctxkey(dctx, src, bytes); |
| 2785 | + if (likely(bytes - used)) |
| 2786 | + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); |
| 2787 | + src += bytes; |
| 2788 | } |
| 2789 | |
| 2790 | if (unlikely(srclen)) { |
| 2791 | @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130 |
| 2792 | } |
| 2793 | EXPORT_SYMBOL(poly1305_update_arch); |
| 2794 | |
| 2795 | -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) |
| 2796 | +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) |
| 2797 | { |
| 2798 | - __le32 digest[4]; |
| 2799 | - u64 f = 0; |
| 2800 | - |
| 2801 | - if (unlikely(desc->buflen)) { |
| 2802 | - desc->buf[desc->buflen++] = 1; |
| 2803 | - memset(desc->buf + desc->buflen, 0, |
| 2804 | - POLY1305_BLOCK_SIZE - desc->buflen); |
| 2805 | - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); |
| 2806 | + if (unlikely(dctx->buflen)) { |
| 2807 | + dctx->buf[dctx->buflen++] = 1; |
| 2808 | + memset(dctx->buf + dctx->buflen, 0, |
| 2809 | + POLY1305_BLOCK_SIZE - dctx->buflen); |
| 2810 | + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); |
| 2811 | } |
| 2812 | |
| 2813 | - poly1305_integer_emit(&desc->h, digest); |
| 2814 | - |
| 2815 | - /* mac = (h + s) % (2^128) */ |
| 2816 | - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; |
| 2817 | - put_unaligned_le32(f, dst + 0); |
| 2818 | - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; |
| 2819 | - put_unaligned_le32(f, dst + 4); |
| 2820 | - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; |
| 2821 | - put_unaligned_le32(f, dst + 8); |
| 2822 | - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; |
| 2823 | - put_unaligned_le32(f, dst + 12); |
| 2824 | - |
| 2825 | - *desc = (struct poly1305_desc_ctx){}; |
| 2826 | + poly1305_simd_emit(&dctx->h, dst, dctx->s); |
| 2827 | + *dctx = (struct poly1305_desc_ctx){}; |
| 2828 | } |
| 2829 | EXPORT_SYMBOL(poly1305_final_arch); |
| 2830 | |
| 2831 | @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s |
| 2832 | { |
| 2833 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2834 | |
| 2835 | - poly1305_core_init(&dctx->h); |
| 2836 | - dctx->buflen = 0; |
| 2837 | - dctx->rset = 0; |
| 2838 | - dctx->sset = false; |
| 2839 | - |
| 2840 | + *dctx = (struct poly1305_desc_ctx){}; |
| 2841 | return 0; |
| 2842 | } |
| 2843 | |
| 2844 | -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| 2845 | +static int crypto_poly1305_update(struct shash_desc *desc, |
| 2846 | + const u8 *src, unsigned int srclen) |
| 2847 | { |
| 2848 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2849 | |
| 2850 | - if (unlikely(!dctx->sset)) |
| 2851 | - return -ENOKEY; |
| 2852 | - |
| 2853 | - poly1305_final_arch(dctx, dst); |
| 2854 | + poly1305_update_arch(dctx, src, srclen); |
| 2855 | return 0; |
| 2856 | } |
| 2857 | |
| 2858 | -static int poly1305_simd_update(struct shash_desc *desc, |
| 2859 | - const u8 *src, unsigned int srclen) |
| 2860 | +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| 2861 | { |
| 2862 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2863 | |
| 2864 | - poly1305_update_arch(dctx, src, srclen); |
| 2865 | + if (unlikely(!dctx->sset)) |
| 2866 | + return -ENOKEY; |
| 2867 | + |
| 2868 | + poly1305_final_arch(dctx, dst); |
| 2869 | return 0; |
| 2870 | } |
| 2871 | |
| 2872 | static struct shash_alg alg = { |
| 2873 | .digestsize = POLY1305_DIGEST_SIZE, |
| 2874 | .init = crypto_poly1305_init, |
| 2875 | - .update = poly1305_simd_update, |
| 2876 | + .update = crypto_poly1305_update, |
| 2877 | .final = crypto_poly1305_final, |
| 2878 | .descsize = sizeof(struct poly1305_desc_ctx), |
| 2879 | .base = { |
| 2880 | @@ -406,17 +265,19 @@ static struct shash_alg alg = { |
| 2881 | |
| 2882 | static int __init poly1305_simd_mod_init(void) |
| 2883 | { |
| 2884 | - if (!boot_cpu_has(X86_FEATURE_XMM2)) |
| 2885 | - return 0; |
| 2886 | - |
| 2887 | - static_branch_enable(&poly1305_use_simd); |
| 2888 | - |
| 2889 | - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| 2890 | - boot_cpu_has(X86_FEATURE_AVX) && |
| 2891 | + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2892 | + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| 2893 | + static_branch_enable(&poly1305_use_avx); |
| 2894 | + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2895 | boot_cpu_has(X86_FEATURE_AVX2) && |
| 2896 | cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| 2897 | static_branch_enable(&poly1305_use_avx2); |
| 2898 | - |
| 2899 | + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2900 | + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && |
| 2901 | + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && |
| 2902 | + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ |
| 2903 | + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) |
| 2904 | + static_branch_enable(&poly1305_use_avx512); |
| 2905 | return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; |
| 2906 | } |
| 2907 | |
| 2908 | @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); |
| 2909 | module_exit(poly1305_simd_mod_exit); |
| 2910 | |
| 2911 | MODULE_LICENSE("GPL"); |
| 2912 | -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); |
| 2913 | +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); |
| 2914 | MODULE_DESCRIPTION("Poly1305 authenticator"); |
| 2915 | MODULE_ALIAS_CRYPTO("poly1305"); |
| 2916 | MODULE_ALIAS_CRYPTO("poly1305-simd"); |
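With the rewrite above, the arch entry points poly1305_init_arch(), poly1305_update_arch() and poly1305_final_arch() are exported and handle key setup, partial-block buffering and the final reduction themselves. A minimal sketch of driving them for a one-shot MAC, assuming struct poly1305_desc_ctx and the size constants from <crypto/poly1305.h>; the wrapper name is illustrative, and in-tree users would normally reach these through the generic poly1305_init()/poly1305_update()/poly1305_final() helpers:

    #include <crypto/poly1305.h>

    /* One-shot Poly1305 MAC over a flat buffer, using the arch entry points
     * exported by the glue code above.  key is 32 bytes, mac is 16 bytes. */
    static void poly1305_mac_oneshot(const u8 key[POLY1305_KEY_SIZE],
                                     const u8 *msg, unsigned int len,
                                     u8 mac[POLY1305_DIGEST_SIZE])
    {
            struct poly1305_desc_ctx dctx;

            poly1305_init_arch(&dctx, key);         /* load r and s, reset state */
            poly1305_update_arch(&dctx, msg, len);  /* buffers partial blocks itself */
            poly1305_final_arch(&dctx, mac);        /* pad, emit, and wipe dctx */
    }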
| 2917 | --- a/lib/crypto/Kconfig |
| 2918 | +++ b/lib/crypto/Kconfig |
| 2919 | @@ -65,7 +65,7 @@ config CRYPTO_LIB_DES |
| 2920 | config CRYPTO_LIB_POLY1305_RSIZE |
| 2921 | int |
| 2922 | default 2 if MIPS |
| 2923 | - default 4 if X86_64 |
| 2924 | + default 11 if X86_64 |
| 2925 | default 9 if ARM || ARM64 |
| 2926 | default 1 |
| 2927 | |
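Stepping back to the glue code above, one detail of poly1305_simd_blocks() that generalises: kernel_fpu_begin() disables preemption, so the loop feeds the vector code at most one page of input per FPU section and re-enables preemption between chunks. A minimal sketch of that pattern on its own, with the actual block functions abstracted into caller-supplied callbacks; simd_fn and scalar_fn are illustrative stand-ins, not names from this patch:

    #include <linux/kernel.h>
    #include <linux/mm.h>
    #include <linux/types.h>
    #include <asm/fpu/api.h>
    #include <crypto/internal/simd.h>

    /* Run simd_fn over inp[0..len) in at most page-sized chunks so that
     * preemption is never disabled for longer than one page of work;
     * fall back to scalar_fn when the FPU cannot be used in this context. */
    static void run_in_page_chunks(const u8 *inp, size_t len,
                                   void (*simd_fn)(const u8 *, size_t),
                                   void (*scalar_fn)(const u8 *, size_t))
    {
            if (!crypto_simd_usable()) {
                    scalar_fn(inp, len);
                    return;
            }

            while (len) {
                    const size_t bytes = min_t(size_t, len, PAGE_SIZE);

                    kernel_fpu_begin();     /* disables preemption */
                    simd_fn(inp, bytes);
                    kernel_fpu_end();       /* re-enables preemption between chunks */

                    inp += bytes;
                    len -= bytes;
            }
    }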