| 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Sun, 5 Jan 2020 22:40:48 -0500 |
| 4 | Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for |
| 5 | kernel |
| 6 | |
| 7 | commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream. |
| 8 | |
| 9 | These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F. |
| 10 | The AVX-512F implementation is disabled on Skylake, due to throttling, |
| 11 | but it is quite fast on >= Cannonlake. |
| 12 | |
| 13 | On the left are cycle counts on a Core i7 6700HQ using the AVX-2 |
| 14 | codepath, comparing this implementation ("new") to the implementation in |
| 15 | the current crypto api ("old"). On the right are benchmarks on a Xeon |
| 16 | Gold 5120 using the AVX-512 codepath. The new implementation is faster |
| 17 | on all benchmarks. |
| 18 | |
| 19 | AVX-2 AVX-512 |
| 20 | --------- ----------- |
| 21 | |
| 22 | size old new size old new |
| 23 | ---- ---- ---- ---- ---- ---- |
| 24 | 0 70 68 0 74 70 |
| 25 | 16 92 90 16 96 92 |
| 26 | 32 134 104 32 136 106 |
| 27 | 48 172 120 48 184 124 |
| 28 | 64 218 136 64 218 138 |
| 29 | 80 254 158 80 260 160 |
| 30 | 96 298 174 96 300 176 |
| 31 | 112 342 192 112 342 194 |
| 32 | 128 388 212 128 384 212 |
| 33 | 144 428 228 144 420 226 |
| 34 | 160 466 246 160 464 248 |
| 35 | 176 510 264 176 504 264 |
| 36 | 192 550 282 192 544 282 |
| 37 | 208 594 302 208 582 300 |
| 38 | 224 628 316 224 624 318 |
| 39 | 240 676 334 240 662 338 |
| 40 | 256 716 354 256 708 358 |
| 41 | 272 764 374 272 748 372 |
| 42 | 288 802 352 288 788 358 |
| 43 | 304 420 366 304 422 370 |
| 44 | 320 428 360 320 432 364 |
| 45 | 336 484 378 336 486 380 |
| 46 | 352 426 384 352 434 390 |
| 47 | 368 478 400 368 480 408 |
| 48 | 384 488 394 384 490 398 |
| 49 | 400 542 408 400 542 412 |
| 50 | 416 486 416 416 492 426 |
| 51 | 432 534 430 432 538 436 |
| 52 | 448 544 422 448 546 432 |
| 53 | 464 600 438 464 600 448 |
| 54 | 480 540 448 480 548 456 |
| 55 | 496 594 464 496 594 476 |
| 56 | 512 602 456 512 606 470 |
| 57 | 528 656 476 528 656 480 |
| 58 | 544 600 480 544 606 498 |
| 59 | 560 650 494 560 652 512 |
| 60 | 576 664 490 576 662 508 |
| 61 | 592 714 508 592 716 522 |
| 62 | 608 656 514 608 664 538 |
| 63 | 624 708 532 624 710 552 |
| 64 | 640 716 524 640 720 516 |
| 65 | 656 770 536 656 772 526 |
| 66 | 672 716 548 672 722 544 |
| 67 | 688 770 562 688 768 556 |
| 68 | 704 774 552 704 778 556 |
| 69 | 720 826 568 720 832 568 |
| 70 | 736 768 574 736 780 584 |
| 71 | 752 822 592 752 826 600 |
| 72 | 768 830 584 768 836 560 |
| 73 | 784 884 602 784 888 572 |
| 74 | 800 828 610 800 838 588 |
| 75 | 816 884 628 816 884 604 |
| 76 | 832 888 618 832 894 598 |
| 77 | 848 942 632 848 946 612 |
| 78 | 864 884 644 864 896 628 |
| 79 | 880 936 660 880 942 644 |
| 80 | 896 948 652 896 952 608 |
| 81 | 912 1000 664 912 1004 616 |
| 82 | 928 942 676 928 954 634 |
| 83 | 944 994 690 944 1000 646 |
| 84 | 960 1002 680 960 1008 646 |
| 85 | 976 1054 694 976 1062 658 |
| 86 | 992 1002 706 992 1012 674 |
| 87 | 1008 1052 720 1008 1058 690 |
| 88 | |
| 89 | This commit wires in the prior implementation from Andy, and makes the |
| 90 | following changes to be suitable for kernel land. |
| 91 | |
| 92 | - Some cosmetic and structural changes, like renaming labels to |
| 93 | .Lname, constants, and other Linux conventions, as well as making |
| 94 | the code easy for us to maintain moving forward. |
| 95 | |
| 96 | - CPU feature checking is done in C by the glue code. |
| 97 | |
| 98 | - We avoid jumping into the middle of functions, to appease objtool, |
| 99 | and instead parameterize shared code. |
| 100 | |
| 101 | - We maintain frame pointers so that stack traces make sense. |
| 102 | |
| 103 | - We remove the dependency on the perl xlate code, which transforms |
| 104 | the output into formats used by assemblers we don't care about. |
| 105 | |
| 106 | Importantly, none of our changes affect the arithmetic or core code, but |
| 107 | just involve the differing environment of kernel space. |
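
As an illustration of the "CPU feature checking is done in C by the glue code" point above, the sketch below shows the general dispatch pattern: probe CPU features once at init time, flip a static branch, and let a C wrapper choose between the assembly entry points. It is a simplified illustration rather than the exact poly1305_glue.c hunk; apart from the poly1305_blocks_* entry points named elsewhere in this patch, the static key and wrapper names are assumptions for the example, and the real wrapper also chunks input around kernel_fpu_begin()/kernel_fpu_end().

    #include <crypto/internal/simd.h>
    #include <linux/jump_label.h>
    #include <linux/linkage.h>
    #include <linux/types.h>
    #include <asm/cpufeature.h>
    #include <asm/simd.h>

    /* Assembly entry points generated from poly1305-x86_64-cryptogams.pl. */
    asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
    				       const size_t len, const u32 padbit);
    asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
    				     const size_t len, const u32 padbit);

    /* Illustrative static key; the real glue code defines similar ones. */
    static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);

    static int __init poly1305_simd_mod_init(void)
    {
    	/* Feature probing happens once, in C, not in the .S code. */
    	if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
    	    boot_cpu_has(X86_FEATURE_AVX2) &&
    	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
    		static_branch_enable(&poly1305_use_avx2);
    	return 0;
    }

    static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
    				 u32 padbit)
    {
    	/* Fall back to the integer code when the FPU is not usable. */
    	if (static_branch_likely(&poly1305_use_avx2) && crypto_simd_usable()) {
    		kernel_fpu_begin();
    		poly1305_blocks_avx2(ctx, inp, len, padbit);
    		kernel_fpu_end();
    	} else {
    		poly1305_blocks_x86_64(ctx, inp, len, padbit);
    	}
    }
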
| 108 | |
| 109 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 110 | Signed-off-by: Samuel Neves <sneves@dei.uc.pt> |
| 111 | Co-developed-by: Samuel Neves <sneves@dei.uc.pt> |
| 112 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 113 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 114 | --- |
| 115 | arch/x86/crypto/.gitignore | 1 + |
| 116 | arch/x86/crypto/Makefile | 11 +- |
| 117 | arch/x86/crypto/poly1305-avx2-x86_64.S | 390 ---------- |
| 118 | arch/x86/crypto/poly1305-sse2-x86_64.S | 590 --------------- |
| 119 | arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++-------- |
| 120 | arch/x86/crypto/poly1305_glue.c | 473 +++++------- |
| 121 | lib/crypto/Kconfig | 2 +- |
| 122 | 7 files changed, 572 insertions(+), 1577 deletions(-) |
| 123 | create mode 100644 arch/x86/crypto/.gitignore |
| 124 | delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S |
| 125 | delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S |
| 126 | |
| 127 | --- /dev/null |
| 128 | +++ b/arch/x86/crypto/.gitignore |
| 129 | @@ -0,0 +1 @@ |
| 130 | +poly1305-x86_64.S |
| 131 | --- a/arch/x86/crypto/Makefile |
| 132 | +++ b/arch/x86/crypto/Makefile |
| 133 | @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o |
| 134 | |
| 135 | nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o |
| 136 | blake2s-x86_64-y := blake2s-core.o blake2s-glue.o |
| 137 | +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o |
| 138 | +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) |
| 139 | +targets += poly1305-x86_64-cryptogams.S |
| 140 | +endif |
| 141 | |
| 142 | ifeq ($(avx_supported),yes) |
| 143 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ |
| 144 | @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni |
| 145 | aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o |
| 146 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
| 147 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
| 148 | -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o |
| 149 | ifeq ($(avx2_supported),yes) |
| 150 | sha1-ssse3-y += sha1_avx2_x86_64_asm.o |
| 151 | -poly1305-x86_64-y += poly1305-avx2-x86_64.o |
| 152 | endif |
| 153 | ifeq ($(sha1_ni_supported),yes) |
| 154 | sha1-ssse3-y += sha1_ni_asm.o |
| 155 | @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o |
| 156 | endif |
| 157 | sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o |
| 158 | crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o |
| 159 | + |
| 160 | +quiet_cmd_perlasm = PERLASM $@ |
| 161 | + cmd_perlasm = $(PERL) $< > $@ |
| 162 | +$(obj)/%.S: $(src)/%.pl FORCE |
| 163 | + $(call if_changed,perlasm) |
| 164 | --- a/arch/x86/crypto/poly1305-avx2-x86_64.S |
| 165 | +++ /dev/null |
| 166 | @@ -1,390 +0,0 @@ |
| 167 | -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| 168 | -/* |
| 169 | - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions |
| 170 | - * |
| 171 | - * Copyright (C) 2015 Martin Willi |
| 172 | - */ |
| 173 | - |
| 174 | -#include <linux/linkage.h> |
| 175 | - |
| 176 | -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 |
| 177 | -.align 32 |
| 178 | -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| 179 | - .octa 0x0000000003ffffff0000000003ffffff |
| 180 | - |
| 181 | -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 |
| 182 | -.align 32 |
| 183 | -ORMASK: .octa 0x00000000010000000000000001000000 |
| 184 | - .octa 0x00000000010000000000000001000000 |
| 185 | - |
| 186 | -.text |
| 187 | - |
| 188 | -#define h0 0x00(%rdi) |
| 189 | -#define h1 0x04(%rdi) |
| 190 | -#define h2 0x08(%rdi) |
| 191 | -#define h3 0x0c(%rdi) |
| 192 | -#define h4 0x10(%rdi) |
| 193 | -#define r0 0x00(%rdx) |
| 194 | -#define r1 0x04(%rdx) |
| 195 | -#define r2 0x08(%rdx) |
| 196 | -#define r3 0x0c(%rdx) |
| 197 | -#define r4 0x10(%rdx) |
| 198 | -#define u0 0x00(%r8) |
| 199 | -#define u1 0x04(%r8) |
| 200 | -#define u2 0x08(%r8) |
| 201 | -#define u3 0x0c(%r8) |
| 202 | -#define u4 0x10(%r8) |
| 203 | -#define w0 0x18(%r8) |
| 204 | -#define w1 0x1c(%r8) |
| 205 | -#define w2 0x20(%r8) |
| 206 | -#define w3 0x24(%r8) |
| 207 | -#define w4 0x28(%r8) |
| 208 | -#define y0 0x30(%r8) |
| 209 | -#define y1 0x34(%r8) |
| 210 | -#define y2 0x38(%r8) |
| 211 | -#define y3 0x3c(%r8) |
| 212 | -#define y4 0x40(%r8) |
| 213 | -#define m %rsi |
| 214 | -#define hc0 %ymm0 |
| 215 | -#define hc1 %ymm1 |
| 216 | -#define hc2 %ymm2 |
| 217 | -#define hc3 %ymm3 |
| 218 | -#define hc4 %ymm4 |
| 219 | -#define hc0x %xmm0 |
| 220 | -#define hc1x %xmm1 |
| 221 | -#define hc2x %xmm2 |
| 222 | -#define hc3x %xmm3 |
| 223 | -#define hc4x %xmm4 |
| 224 | -#define t1 %ymm5 |
| 225 | -#define t2 %ymm6 |
| 226 | -#define t1x %xmm5 |
| 227 | -#define t2x %xmm6 |
| 228 | -#define ruwy0 %ymm7 |
| 229 | -#define ruwy1 %ymm8 |
| 230 | -#define ruwy2 %ymm9 |
| 231 | -#define ruwy3 %ymm10 |
| 232 | -#define ruwy4 %ymm11 |
| 233 | -#define ruwy0x %xmm7 |
| 234 | -#define ruwy1x %xmm8 |
| 235 | -#define ruwy2x %xmm9 |
| 236 | -#define ruwy3x %xmm10 |
| 237 | -#define ruwy4x %xmm11 |
| 238 | -#define svxz1 %ymm12 |
| 239 | -#define svxz2 %ymm13 |
| 240 | -#define svxz3 %ymm14 |
| 241 | -#define svxz4 %ymm15 |
| 242 | -#define d0 %r9 |
| 243 | -#define d1 %r10 |
| 244 | -#define d2 %r11 |
| 245 | -#define d3 %r12 |
| 246 | -#define d4 %r13 |
| 247 | - |
| 248 | -ENTRY(poly1305_4block_avx2) |
| 249 | - # %rdi: Accumulator h[5] |
| 250 | - # %rsi: 64 byte input block m |
| 251 | - # %rdx: Poly1305 key r[5] |
| 252 | - # %rcx: Quadblock count |
| 253 | - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], |
| 254 | - |
| 255 | - # This four-block variant uses loop unrolled block processing. It |
| 256 | - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: |
| 257 | - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r |
| 258 | - |
| 259 | - vzeroupper |
| 260 | - push %rbx |
| 261 | - push %r12 |
| 262 | - push %r13 |
| 263 | - |
| 264 | - # combine r0,u0,w0,y0 |
| 265 | - vmovd y0,ruwy0x |
| 266 | - vmovd w0,t1x |
| 267 | - vpunpcklqdq t1,ruwy0,ruwy0 |
| 268 | - vmovd u0,t1x |
| 269 | - vmovd r0,t2x |
| 270 | - vpunpcklqdq t2,t1,t1 |
| 271 | - vperm2i128 $0x20,t1,ruwy0,ruwy0 |
| 272 | - |
| 273 | - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 |
| 274 | - vmovd y1,ruwy1x |
| 275 | - vmovd w1,t1x |
| 276 | - vpunpcklqdq t1,ruwy1,ruwy1 |
| 277 | - vmovd u1,t1x |
| 278 | - vmovd r1,t2x |
| 279 | - vpunpcklqdq t2,t1,t1 |
| 280 | - vperm2i128 $0x20,t1,ruwy1,ruwy1 |
| 281 | - vpslld $2,ruwy1,svxz1 |
| 282 | - vpaddd ruwy1,svxz1,svxz1 |
| 283 | - |
| 284 | - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 |
| 285 | - vmovd y2,ruwy2x |
| 286 | - vmovd w2,t1x |
| 287 | - vpunpcklqdq t1,ruwy2,ruwy2 |
| 288 | - vmovd u2,t1x |
| 289 | - vmovd r2,t2x |
| 290 | - vpunpcklqdq t2,t1,t1 |
| 291 | - vperm2i128 $0x20,t1,ruwy2,ruwy2 |
| 292 | - vpslld $2,ruwy2,svxz2 |
| 293 | - vpaddd ruwy2,svxz2,svxz2 |
| 294 | - |
| 295 | - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 |
| 296 | - vmovd y3,ruwy3x |
| 297 | - vmovd w3,t1x |
| 298 | - vpunpcklqdq t1,ruwy3,ruwy3 |
| 299 | - vmovd u3,t1x |
| 300 | - vmovd r3,t2x |
| 301 | - vpunpcklqdq t2,t1,t1 |
| 302 | - vperm2i128 $0x20,t1,ruwy3,ruwy3 |
| 303 | - vpslld $2,ruwy3,svxz3 |
| 304 | - vpaddd ruwy3,svxz3,svxz3 |
| 305 | - |
| 306 | - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 |
| 307 | - vmovd y4,ruwy4x |
| 308 | - vmovd w4,t1x |
| 309 | - vpunpcklqdq t1,ruwy4,ruwy4 |
| 310 | - vmovd u4,t1x |
| 311 | - vmovd r4,t2x |
| 312 | - vpunpcklqdq t2,t1,t1 |
| 313 | - vperm2i128 $0x20,t1,ruwy4,ruwy4 |
| 314 | - vpslld $2,ruwy4,svxz4 |
| 315 | - vpaddd ruwy4,svxz4,svxz4 |
| 316 | - |
| 317 | -.Ldoblock4: |
| 318 | - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, |
| 319 | - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] |
| 320 | - vmovd 0x00(m),hc0x |
| 321 | - vmovd 0x10(m),t1x |
| 322 | - vpunpcklqdq t1,hc0,hc0 |
| 323 | - vmovd 0x20(m),t1x |
| 324 | - vmovd 0x30(m),t2x |
| 325 | - vpunpcklqdq t2,t1,t1 |
| 326 | - vperm2i128 $0x20,t1,hc0,hc0 |
| 327 | - vpand ANMASK(%rip),hc0,hc0 |
| 328 | - vmovd h0,t1x |
| 329 | - vpaddd t1,hc0,hc0 |
| 330 | - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, |
| 331 | - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] |
| 332 | - vmovd 0x03(m),hc1x |
| 333 | - vmovd 0x13(m),t1x |
| 334 | - vpunpcklqdq t1,hc1,hc1 |
| 335 | - vmovd 0x23(m),t1x |
| 336 | - vmovd 0x33(m),t2x |
| 337 | - vpunpcklqdq t2,t1,t1 |
| 338 | - vperm2i128 $0x20,t1,hc1,hc1 |
| 339 | - vpsrld $2,hc1,hc1 |
| 340 | - vpand ANMASK(%rip),hc1,hc1 |
| 341 | - vmovd h1,t1x |
| 342 | - vpaddd t1,hc1,hc1 |
| 343 | - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, |
| 344 | - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] |
| 345 | - vmovd 0x06(m),hc2x |
| 346 | - vmovd 0x16(m),t1x |
| 347 | - vpunpcklqdq t1,hc2,hc2 |
| 348 | - vmovd 0x26(m),t1x |
| 349 | - vmovd 0x36(m),t2x |
| 350 | - vpunpcklqdq t2,t1,t1 |
| 351 | - vperm2i128 $0x20,t1,hc2,hc2 |
| 352 | - vpsrld $4,hc2,hc2 |
| 353 | - vpand ANMASK(%rip),hc2,hc2 |
| 354 | - vmovd h2,t1x |
| 355 | - vpaddd t1,hc2,hc2 |
| 356 | - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, |
| 357 | - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] |
| 358 | - vmovd 0x09(m),hc3x |
| 359 | - vmovd 0x19(m),t1x |
| 360 | - vpunpcklqdq t1,hc3,hc3 |
| 361 | - vmovd 0x29(m),t1x |
| 362 | - vmovd 0x39(m),t2x |
| 363 | - vpunpcklqdq t2,t1,t1 |
| 364 | - vperm2i128 $0x20,t1,hc3,hc3 |
| 365 | - vpsrld $6,hc3,hc3 |
| 366 | - vpand ANMASK(%rip),hc3,hc3 |
| 367 | - vmovd h3,t1x |
| 368 | - vpaddd t1,hc3,hc3 |
| 369 | - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), |
| 370 | - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] |
| 371 | - vmovd 0x0c(m),hc4x |
| 372 | - vmovd 0x1c(m),t1x |
| 373 | - vpunpcklqdq t1,hc4,hc4 |
| 374 | - vmovd 0x2c(m),t1x |
| 375 | - vmovd 0x3c(m),t2x |
| 376 | - vpunpcklqdq t2,t1,t1 |
| 377 | - vperm2i128 $0x20,t1,hc4,hc4 |
| 378 | - vpsrld $8,hc4,hc4 |
| 379 | - vpor ORMASK(%rip),hc4,hc4 |
| 380 | - vmovd h4,t1x |
| 381 | - vpaddd t1,hc4,hc4 |
| 382 | - |
| 383 | - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] |
| 384 | - vpmuludq hc0,ruwy0,t1 |
| 385 | - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] |
| 386 | - vpmuludq hc1,svxz4,t2 |
| 387 | - vpaddq t2,t1,t1 |
| 388 | - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] |
| 389 | - vpmuludq hc2,svxz3,t2 |
| 390 | - vpaddq t2,t1,t1 |
| 391 | - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] |
| 392 | - vpmuludq hc3,svxz2,t2 |
| 393 | - vpaddq t2,t1,t1 |
| 394 | - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] |
| 395 | - vpmuludq hc4,svxz1,t2 |
| 396 | - vpaddq t2,t1,t1 |
| 397 | - # d0 = t1[0] + t1[1] + t[2] + t[3] |
| 398 | - vpermq $0xee,t1,t2 |
| 399 | - vpaddq t2,t1,t1 |
| 400 | - vpsrldq $8,t1,t2 |
| 401 | - vpaddq t2,t1,t1 |
| 402 | - vmovq t1x,d0 |
| 403 | - |
| 404 | - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] |
| 405 | - vpmuludq hc0,ruwy1,t1 |
| 406 | - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] |
| 407 | - vpmuludq hc1,ruwy0,t2 |
| 408 | - vpaddq t2,t1,t1 |
| 409 | - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] |
| 410 | - vpmuludq hc2,svxz4,t2 |
| 411 | - vpaddq t2,t1,t1 |
| 412 | - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] |
| 413 | - vpmuludq hc3,svxz3,t2 |
| 414 | - vpaddq t2,t1,t1 |
| 415 | - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] |
| 416 | - vpmuludq hc4,svxz2,t2 |
| 417 | - vpaddq t2,t1,t1 |
| 418 | - # d1 = t1[0] + t1[1] + t1[3] + t1[4] |
| 419 | - vpermq $0xee,t1,t2 |
| 420 | - vpaddq t2,t1,t1 |
| 421 | - vpsrldq $8,t1,t2 |
| 422 | - vpaddq t2,t1,t1 |
| 423 | - vmovq t1x,d1 |
| 424 | - |
| 425 | - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] |
| 426 | - vpmuludq hc0,ruwy2,t1 |
| 427 | - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] |
| 428 | - vpmuludq hc1,ruwy1,t2 |
| 429 | - vpaddq t2,t1,t1 |
| 430 | - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] |
| 431 | - vpmuludq hc2,ruwy0,t2 |
| 432 | - vpaddq t2,t1,t1 |
| 433 | - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] |
| 434 | - vpmuludq hc3,svxz4,t2 |
| 435 | - vpaddq t2,t1,t1 |
| 436 | - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] |
| 437 | - vpmuludq hc4,svxz3,t2 |
| 438 | - vpaddq t2,t1,t1 |
| 439 | - # d2 = t1[0] + t1[1] + t1[2] + t1[3] |
| 440 | - vpermq $0xee,t1,t2 |
| 441 | - vpaddq t2,t1,t1 |
| 442 | - vpsrldq $8,t1,t2 |
| 443 | - vpaddq t2,t1,t1 |
| 444 | - vmovq t1x,d2 |
| 445 | - |
| 446 | - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] |
| 447 | - vpmuludq hc0,ruwy3,t1 |
| 448 | - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] |
| 449 | - vpmuludq hc1,ruwy2,t2 |
| 450 | - vpaddq t2,t1,t1 |
| 451 | - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] |
| 452 | - vpmuludq hc2,ruwy1,t2 |
| 453 | - vpaddq t2,t1,t1 |
| 454 | - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] |
| 455 | - vpmuludq hc3,ruwy0,t2 |
| 456 | - vpaddq t2,t1,t1 |
| 457 | - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] |
| 458 | - vpmuludq hc4,svxz4,t2 |
| 459 | - vpaddq t2,t1,t1 |
| 460 | - # d3 = t1[0] + t1[1] + t1[2] + t1[3] |
| 461 | - vpermq $0xee,t1,t2 |
| 462 | - vpaddq t2,t1,t1 |
| 463 | - vpsrldq $8,t1,t2 |
| 464 | - vpaddq t2,t1,t1 |
| 465 | - vmovq t1x,d3 |
| 466 | - |
| 467 | - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] |
| 468 | - vpmuludq hc0,ruwy4,t1 |
| 469 | - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] |
| 470 | - vpmuludq hc1,ruwy3,t2 |
| 471 | - vpaddq t2,t1,t1 |
| 472 | - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] |
| 473 | - vpmuludq hc2,ruwy2,t2 |
| 474 | - vpaddq t2,t1,t1 |
| 475 | - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] |
| 476 | - vpmuludq hc3,ruwy1,t2 |
| 477 | - vpaddq t2,t1,t1 |
| 478 | - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] |
| 479 | - vpmuludq hc4,ruwy0,t2 |
| 480 | - vpaddq t2,t1,t1 |
| 481 | - # d4 = t1[0] + t1[1] + t1[2] + t1[3] |
| 482 | - vpermq $0xee,t1,t2 |
| 483 | - vpaddq t2,t1,t1 |
| 484 | - vpsrldq $8,t1,t2 |
| 485 | - vpaddq t2,t1,t1 |
| 486 | - vmovq t1x,d4 |
| 487 | - |
| 488 | - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| 489 | - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| 490 | - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| 491 | - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| 492 | - # integers. It's true in a single-block implementation, but not here. |
| 493 | - |
| 494 | - # d1 += d0 >> 26 |
| 495 | - mov d0,%rax |
| 496 | - shr $26,%rax |
| 497 | - add %rax,d1 |
| 498 | - # h0 = d0 & 0x3ffffff |
| 499 | - mov d0,%rbx |
| 500 | - and $0x3ffffff,%ebx |
| 501 | - |
| 502 | - # d2 += d1 >> 26 |
| 503 | - mov d1,%rax |
| 504 | - shr $26,%rax |
| 505 | - add %rax,d2 |
| 506 | - # h1 = d1 & 0x3ffffff |
| 507 | - mov d1,%rax |
| 508 | - and $0x3ffffff,%eax |
| 509 | - mov %eax,h1 |
| 510 | - |
| 511 | - # d3 += d2 >> 26 |
| 512 | - mov d2,%rax |
| 513 | - shr $26,%rax |
| 514 | - add %rax,d3 |
| 515 | - # h2 = d2 & 0x3ffffff |
| 516 | - mov d2,%rax |
| 517 | - and $0x3ffffff,%eax |
| 518 | - mov %eax,h2 |
| 519 | - |
| 520 | - # d4 += d3 >> 26 |
| 521 | - mov d3,%rax |
| 522 | - shr $26,%rax |
| 523 | - add %rax,d4 |
| 524 | - # h3 = d3 & 0x3ffffff |
| 525 | - mov d3,%rax |
| 526 | - and $0x3ffffff,%eax |
| 527 | - mov %eax,h3 |
| 528 | - |
| 529 | - # h0 += (d4 >> 26) * 5 |
| 530 | - mov d4,%rax |
| 531 | - shr $26,%rax |
| 532 | - lea (%rax,%rax,4),%rax |
| 533 | - add %rax,%rbx |
| 534 | - # h4 = d4 & 0x3ffffff |
| 535 | - mov d4,%rax |
| 536 | - and $0x3ffffff,%eax |
| 537 | - mov %eax,h4 |
| 538 | - |
| 539 | - # h1 += h0 >> 26 |
| 540 | - mov %rbx,%rax |
| 541 | - shr $26,%rax |
| 542 | - add %eax,h1 |
| 543 | - # h0 = h0 & 0x3ffffff |
| 544 | - andl $0x3ffffff,%ebx |
| 545 | - mov %ebx,h0 |
| 546 | - |
| 547 | - add $0x40,m |
| 548 | - dec %rcx |
| 549 | - jnz .Ldoblock4 |
| 550 | - |
| 551 | - vzeroupper |
| 552 | - pop %r13 |
| 553 | - pop %r12 |
| 554 | - pop %rbx |
| 555 | - ret |
| 556 | -ENDPROC(poly1305_4block_avx2) |
| 557 | --- a/arch/x86/crypto/poly1305-sse2-x86_64.S |
| 558 | +++ /dev/null |
| 559 | @@ -1,590 +0,0 @@ |
| 560 | -/* SPDX-License-Identifier: GPL-2.0-or-later */ |
| 561 | -/* |
| 562 | - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions |
| 563 | - * |
| 564 | - * Copyright (C) 2015 Martin Willi |
| 565 | - */ |
| 566 | - |
| 567 | -#include <linux/linkage.h> |
| 568 | - |
| 569 | -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 |
| 570 | -.align 16 |
| 571 | -ANMASK: .octa 0x0000000003ffffff0000000003ffffff |
| 572 | - |
| 573 | -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 |
| 574 | -.align 16 |
| 575 | -ORMASK: .octa 0x00000000010000000000000001000000 |
| 576 | - |
| 577 | -.text |
| 578 | - |
| 579 | -#define h0 0x00(%rdi) |
| 580 | -#define h1 0x04(%rdi) |
| 581 | -#define h2 0x08(%rdi) |
| 582 | -#define h3 0x0c(%rdi) |
| 583 | -#define h4 0x10(%rdi) |
| 584 | -#define r0 0x00(%rdx) |
| 585 | -#define r1 0x04(%rdx) |
| 586 | -#define r2 0x08(%rdx) |
| 587 | -#define r3 0x0c(%rdx) |
| 588 | -#define r4 0x10(%rdx) |
| 589 | -#define s1 0x00(%rsp) |
| 590 | -#define s2 0x04(%rsp) |
| 591 | -#define s3 0x08(%rsp) |
| 592 | -#define s4 0x0c(%rsp) |
| 593 | -#define m %rsi |
| 594 | -#define h01 %xmm0 |
| 595 | -#define h23 %xmm1 |
| 596 | -#define h44 %xmm2 |
| 597 | -#define t1 %xmm3 |
| 598 | -#define t2 %xmm4 |
| 599 | -#define t3 %xmm5 |
| 600 | -#define t4 %xmm6 |
| 601 | -#define mask %xmm7 |
| 602 | -#define d0 %r8 |
| 603 | -#define d1 %r9 |
| 604 | -#define d2 %r10 |
| 605 | -#define d3 %r11 |
| 606 | -#define d4 %r12 |
| 607 | - |
| 608 | -ENTRY(poly1305_block_sse2) |
| 609 | - # %rdi: Accumulator h[5] |
| 610 | - # %rsi: 16 byte input block m |
| 611 | - # %rdx: Poly1305 key r[5] |
| 612 | - # %rcx: Block count |
| 613 | - |
| 614 | - # This single block variant tries to improve performance by doing two |
| 615 | - # multiplications in parallel using SSE instructions. There is quite |
| 616 | - # some quardword packing involved, hence the speedup is marginal. |
| 617 | - |
| 618 | - push %rbx |
| 619 | - push %r12 |
| 620 | - sub $0x10,%rsp |
| 621 | - |
| 622 | - # s1..s4 = r1..r4 * 5 |
| 623 | - mov r1,%eax |
| 624 | - lea (%eax,%eax,4),%eax |
| 625 | - mov %eax,s1 |
| 626 | - mov r2,%eax |
| 627 | - lea (%eax,%eax,4),%eax |
| 628 | - mov %eax,s2 |
| 629 | - mov r3,%eax |
| 630 | - lea (%eax,%eax,4),%eax |
| 631 | - mov %eax,s3 |
| 632 | - mov r4,%eax |
| 633 | - lea (%eax,%eax,4),%eax |
| 634 | - mov %eax,s4 |
| 635 | - |
| 636 | - movdqa ANMASK(%rip),mask |
| 637 | - |
| 638 | -.Ldoblock: |
| 639 | - # h01 = [0, h1, 0, h0] |
| 640 | - # h23 = [0, h3, 0, h2] |
| 641 | - # h44 = [0, h4, 0, h4] |
| 642 | - movd h0,h01 |
| 643 | - movd h1,t1 |
| 644 | - movd h2,h23 |
| 645 | - movd h3,t2 |
| 646 | - movd h4,h44 |
| 647 | - punpcklqdq t1,h01 |
| 648 | - punpcklqdq t2,h23 |
| 649 | - punpcklqdq h44,h44 |
| 650 | - |
| 651 | - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] |
| 652 | - movd 0x00(m),t1 |
| 653 | - movd 0x03(m),t2 |
| 654 | - psrld $2,t2 |
| 655 | - punpcklqdq t2,t1 |
| 656 | - pand mask,t1 |
| 657 | - paddd t1,h01 |
| 658 | - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] |
| 659 | - movd 0x06(m),t1 |
| 660 | - movd 0x09(m),t2 |
| 661 | - psrld $4,t1 |
| 662 | - psrld $6,t2 |
| 663 | - punpcklqdq t2,t1 |
| 664 | - pand mask,t1 |
| 665 | - paddd t1,h23 |
| 666 | - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] |
| 667 | - mov 0x0c(m),%eax |
| 668 | - shr $8,%eax |
| 669 | - or $0x01000000,%eax |
| 670 | - movd %eax,t1 |
| 671 | - pshufd $0xc4,t1,t1 |
| 672 | - paddd t1,h44 |
| 673 | - |
| 674 | - # t1[0] = h0 * r0 + h2 * s3 |
| 675 | - # t1[1] = h1 * s4 + h3 * s2 |
| 676 | - movd r0,t1 |
| 677 | - movd s4,t2 |
| 678 | - punpcklqdq t2,t1 |
| 679 | - pmuludq h01,t1 |
| 680 | - movd s3,t2 |
| 681 | - movd s2,t3 |
| 682 | - punpcklqdq t3,t2 |
| 683 | - pmuludq h23,t2 |
| 684 | - paddq t2,t1 |
| 685 | - # t2[0] = h0 * r1 + h2 * s4 |
| 686 | - # t2[1] = h1 * r0 + h3 * s3 |
| 687 | - movd r1,t2 |
| 688 | - movd r0,t3 |
| 689 | - punpcklqdq t3,t2 |
| 690 | - pmuludq h01,t2 |
| 691 | - movd s4,t3 |
| 692 | - movd s3,t4 |
| 693 | - punpcklqdq t4,t3 |
| 694 | - pmuludq h23,t3 |
| 695 | - paddq t3,t2 |
| 696 | - # t3[0] = h4 * s1 |
| 697 | - # t3[1] = h4 * s2 |
| 698 | - movd s1,t3 |
| 699 | - movd s2,t4 |
| 700 | - punpcklqdq t4,t3 |
| 701 | - pmuludq h44,t3 |
| 702 | - # d0 = t1[0] + t1[1] + t3[0] |
| 703 | - # d1 = t2[0] + t2[1] + t3[1] |
| 704 | - movdqa t1,t4 |
| 705 | - punpcklqdq t2,t4 |
| 706 | - punpckhqdq t2,t1 |
| 707 | - paddq t4,t1 |
| 708 | - paddq t3,t1 |
| 709 | - movq t1,d0 |
| 710 | - psrldq $8,t1 |
| 711 | - movq t1,d1 |
| 712 | - |
| 713 | - # t1[0] = h0 * r2 + h2 * r0 |
| 714 | - # t1[1] = h1 * r1 + h3 * s4 |
| 715 | - movd r2,t1 |
| 716 | - movd r1,t2 |
| 717 | - punpcklqdq t2,t1 |
| 718 | - pmuludq h01,t1 |
| 719 | - movd r0,t2 |
| 720 | - movd s4,t3 |
| 721 | - punpcklqdq t3,t2 |
| 722 | - pmuludq h23,t2 |
| 723 | - paddq t2,t1 |
| 724 | - # t2[0] = h0 * r3 + h2 * r1 |
| 725 | - # t2[1] = h1 * r2 + h3 * r0 |
| 726 | - movd r3,t2 |
| 727 | - movd r2,t3 |
| 728 | - punpcklqdq t3,t2 |
| 729 | - pmuludq h01,t2 |
| 730 | - movd r1,t3 |
| 731 | - movd r0,t4 |
| 732 | - punpcklqdq t4,t3 |
| 733 | - pmuludq h23,t3 |
| 734 | - paddq t3,t2 |
| 735 | - # t3[0] = h4 * s3 |
| 736 | - # t3[1] = h4 * s4 |
| 737 | - movd s3,t3 |
| 738 | - movd s4,t4 |
| 739 | - punpcklqdq t4,t3 |
| 740 | - pmuludq h44,t3 |
| 741 | - # d2 = t1[0] + t1[1] + t3[0] |
| 742 | - # d3 = t2[0] + t2[1] + t3[1] |
| 743 | - movdqa t1,t4 |
| 744 | - punpcklqdq t2,t4 |
| 745 | - punpckhqdq t2,t1 |
| 746 | - paddq t4,t1 |
| 747 | - paddq t3,t1 |
| 748 | - movq t1,d2 |
| 749 | - psrldq $8,t1 |
| 750 | - movq t1,d3 |
| 751 | - |
| 752 | - # t1[0] = h0 * r4 + h2 * r2 |
| 753 | - # t1[1] = h1 * r3 + h3 * r1 |
| 754 | - movd r4,t1 |
| 755 | - movd r3,t2 |
| 756 | - punpcklqdq t2,t1 |
| 757 | - pmuludq h01,t1 |
| 758 | - movd r2,t2 |
| 759 | - movd r1,t3 |
| 760 | - punpcklqdq t3,t2 |
| 761 | - pmuludq h23,t2 |
| 762 | - paddq t2,t1 |
| 763 | - # t3[0] = h4 * r0 |
| 764 | - movd r0,t3 |
| 765 | - pmuludq h44,t3 |
| 766 | - # d4 = t1[0] + t1[1] + t3[0] |
| 767 | - movdqa t1,t4 |
| 768 | - psrldq $8,t4 |
| 769 | - paddq t4,t1 |
| 770 | - paddq t3,t1 |
| 771 | - movq t1,d4 |
| 772 | - |
| 773 | - # d1 += d0 >> 26 |
| 774 | - mov d0,%rax |
| 775 | - shr $26,%rax |
| 776 | - add %rax,d1 |
| 777 | - # h0 = d0 & 0x3ffffff |
| 778 | - mov d0,%rbx |
| 779 | - and $0x3ffffff,%ebx |
| 780 | - |
| 781 | - # d2 += d1 >> 26 |
| 782 | - mov d1,%rax |
| 783 | - shr $26,%rax |
| 784 | - add %rax,d2 |
| 785 | - # h1 = d1 & 0x3ffffff |
| 786 | - mov d1,%rax |
| 787 | - and $0x3ffffff,%eax |
| 788 | - mov %eax,h1 |
| 789 | - |
| 790 | - # d3 += d2 >> 26 |
| 791 | - mov d2,%rax |
| 792 | - shr $26,%rax |
| 793 | - add %rax,d3 |
| 794 | - # h2 = d2 & 0x3ffffff |
| 795 | - mov d2,%rax |
| 796 | - and $0x3ffffff,%eax |
| 797 | - mov %eax,h2 |
| 798 | - |
| 799 | - # d4 += d3 >> 26 |
| 800 | - mov d3,%rax |
| 801 | - shr $26,%rax |
| 802 | - add %rax,d4 |
| 803 | - # h3 = d3 & 0x3ffffff |
| 804 | - mov d3,%rax |
| 805 | - and $0x3ffffff,%eax |
| 806 | - mov %eax,h3 |
| 807 | - |
| 808 | - # h0 += (d4 >> 26) * 5 |
| 809 | - mov d4,%rax |
| 810 | - shr $26,%rax |
| 811 | - lea (%rax,%rax,4),%rax |
| 812 | - add %rax,%rbx |
| 813 | - # h4 = d4 & 0x3ffffff |
| 814 | - mov d4,%rax |
| 815 | - and $0x3ffffff,%eax |
| 816 | - mov %eax,h4 |
| 817 | - |
| 818 | - # h1 += h0 >> 26 |
| 819 | - mov %rbx,%rax |
| 820 | - shr $26,%rax |
| 821 | - add %eax,h1 |
| 822 | - # h0 = h0 & 0x3ffffff |
| 823 | - andl $0x3ffffff,%ebx |
| 824 | - mov %ebx,h0 |
| 825 | - |
| 826 | - add $0x10,m |
| 827 | - dec %rcx |
| 828 | - jnz .Ldoblock |
| 829 | - |
| 830 | - # Zeroing of key material |
| 831 | - mov %rcx,0x00(%rsp) |
| 832 | - mov %rcx,0x08(%rsp) |
| 833 | - |
| 834 | - add $0x10,%rsp |
| 835 | - pop %r12 |
| 836 | - pop %rbx |
| 837 | - ret |
| 838 | -ENDPROC(poly1305_block_sse2) |
| 839 | - |
| 840 | - |
| 841 | -#define u0 0x00(%r8) |
| 842 | -#define u1 0x04(%r8) |
| 843 | -#define u2 0x08(%r8) |
| 844 | -#define u3 0x0c(%r8) |
| 845 | -#define u4 0x10(%r8) |
| 846 | -#define hc0 %xmm0 |
| 847 | -#define hc1 %xmm1 |
| 848 | -#define hc2 %xmm2 |
| 849 | -#define hc3 %xmm5 |
| 850 | -#define hc4 %xmm6 |
| 851 | -#define ru0 %xmm7 |
| 852 | -#define ru1 %xmm8 |
| 853 | -#define ru2 %xmm9 |
| 854 | -#define ru3 %xmm10 |
| 855 | -#define ru4 %xmm11 |
| 856 | -#define sv1 %xmm12 |
| 857 | -#define sv2 %xmm13 |
| 858 | -#define sv3 %xmm14 |
| 859 | -#define sv4 %xmm15 |
| 860 | -#undef d0 |
| 861 | -#define d0 %r13 |
| 862 | - |
| 863 | -ENTRY(poly1305_2block_sse2) |
| 864 | - # %rdi: Accumulator h[5] |
| 865 | - # %rsi: 16 byte input block m |
| 866 | - # %rdx: Poly1305 key r[5] |
| 867 | - # %rcx: Doubleblock count |
| 868 | - # %r8: Poly1305 derived key r^2 u[5] |
| 869 | - |
| 870 | - # This two-block variant further improves performance by using loop |
| 871 | - # unrolled block processing. This is more straight forward and does |
| 872 | - # less byte shuffling, but requires a second Poly1305 key r^2: |
| 873 | - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r |
| 874 | - |
| 875 | - push %rbx |
| 876 | - push %r12 |
| 877 | - push %r13 |
| 878 | - |
| 879 | - # combine r0,u0 |
| 880 | - movd u0,ru0 |
| 881 | - movd r0,t1 |
| 882 | - punpcklqdq t1,ru0 |
| 883 | - |
| 884 | - # combine r1,u1 and s1=r1*5,v1=u1*5 |
| 885 | - movd u1,ru1 |
| 886 | - movd r1,t1 |
| 887 | - punpcklqdq t1,ru1 |
| 888 | - movdqa ru1,sv1 |
| 889 | - pslld $2,sv1 |
| 890 | - paddd ru1,sv1 |
| 891 | - |
| 892 | - # combine r2,u2 and s2=r2*5,v2=u2*5 |
| 893 | - movd u2,ru2 |
| 894 | - movd r2,t1 |
| 895 | - punpcklqdq t1,ru2 |
| 896 | - movdqa ru2,sv2 |
| 897 | - pslld $2,sv2 |
| 898 | - paddd ru2,sv2 |
| 899 | - |
| 900 | - # combine r3,u3 and s3=r3*5,v3=u3*5 |
| 901 | - movd u3,ru3 |
| 902 | - movd r3,t1 |
| 903 | - punpcklqdq t1,ru3 |
| 904 | - movdqa ru3,sv3 |
| 905 | - pslld $2,sv3 |
| 906 | - paddd ru3,sv3 |
| 907 | - |
| 908 | - # combine r4,u4 and s4=r4*5,v4=u4*5 |
| 909 | - movd u4,ru4 |
| 910 | - movd r4,t1 |
| 911 | - punpcklqdq t1,ru4 |
| 912 | - movdqa ru4,sv4 |
| 913 | - pslld $2,sv4 |
| 914 | - paddd ru4,sv4 |
| 915 | - |
| 916 | -.Ldoblock2: |
| 917 | - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] |
| 918 | - movd 0x00(m),hc0 |
| 919 | - movd 0x10(m),t1 |
| 920 | - punpcklqdq t1,hc0 |
| 921 | - pand ANMASK(%rip),hc0 |
| 922 | - movd h0,t1 |
| 923 | - paddd t1,hc0 |
| 924 | - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] |
| 925 | - movd 0x03(m),hc1 |
| 926 | - movd 0x13(m),t1 |
| 927 | - punpcklqdq t1,hc1 |
| 928 | - psrld $2,hc1 |
| 929 | - pand ANMASK(%rip),hc1 |
| 930 | - movd h1,t1 |
| 931 | - paddd t1,hc1 |
| 932 | - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] |
| 933 | - movd 0x06(m),hc2 |
| 934 | - movd 0x16(m),t1 |
| 935 | - punpcklqdq t1,hc2 |
| 936 | - psrld $4,hc2 |
| 937 | - pand ANMASK(%rip),hc2 |
| 938 | - movd h2,t1 |
| 939 | - paddd t1,hc2 |
| 940 | - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] |
| 941 | - movd 0x09(m),hc3 |
| 942 | - movd 0x19(m),t1 |
| 943 | - punpcklqdq t1,hc3 |
| 944 | - psrld $6,hc3 |
| 945 | - pand ANMASK(%rip),hc3 |
| 946 | - movd h3,t1 |
| 947 | - paddd t1,hc3 |
| 948 | - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] |
| 949 | - movd 0x0c(m),hc4 |
| 950 | - movd 0x1c(m),t1 |
| 951 | - punpcklqdq t1,hc4 |
| 952 | - psrld $8,hc4 |
| 953 | - por ORMASK(%rip),hc4 |
| 954 | - movd h4,t1 |
| 955 | - paddd t1,hc4 |
| 956 | - |
| 957 | - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] |
| 958 | - movdqa ru0,t1 |
| 959 | - pmuludq hc0,t1 |
| 960 | - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] |
| 961 | - movdqa sv4,t2 |
| 962 | - pmuludq hc1,t2 |
| 963 | - paddq t2,t1 |
| 964 | - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] |
| 965 | - movdqa sv3,t2 |
| 966 | - pmuludq hc2,t2 |
| 967 | - paddq t2,t1 |
| 968 | - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] |
| 969 | - movdqa sv2,t2 |
| 970 | - pmuludq hc3,t2 |
| 971 | - paddq t2,t1 |
| 972 | - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] |
| 973 | - movdqa sv1,t2 |
| 974 | - pmuludq hc4,t2 |
| 975 | - paddq t2,t1 |
| 976 | - # d0 = t1[0] + t1[1] |
| 977 | - movdqa t1,t2 |
| 978 | - psrldq $8,t2 |
| 979 | - paddq t2,t1 |
| 980 | - movq t1,d0 |
| 981 | - |
| 982 | - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] |
| 983 | - movdqa ru1,t1 |
| 984 | - pmuludq hc0,t1 |
| 985 | - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] |
| 986 | - movdqa ru0,t2 |
| 987 | - pmuludq hc1,t2 |
| 988 | - paddq t2,t1 |
| 989 | - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] |
| 990 | - movdqa sv4,t2 |
| 991 | - pmuludq hc2,t2 |
| 992 | - paddq t2,t1 |
| 993 | - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] |
| 994 | - movdqa sv3,t2 |
| 995 | - pmuludq hc3,t2 |
| 996 | - paddq t2,t1 |
| 997 | - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] |
| 998 | - movdqa sv2,t2 |
| 999 | - pmuludq hc4,t2 |
| 1000 | - paddq t2,t1 |
| 1001 | - # d1 = t1[0] + t1[1] |
| 1002 | - movdqa t1,t2 |
| 1003 | - psrldq $8,t2 |
| 1004 | - paddq t2,t1 |
| 1005 | - movq t1,d1 |
| 1006 | - |
| 1007 | - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] |
| 1008 | - movdqa ru2,t1 |
| 1009 | - pmuludq hc0,t1 |
| 1010 | - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] |
| 1011 | - movdqa ru1,t2 |
| 1012 | - pmuludq hc1,t2 |
| 1013 | - paddq t2,t1 |
| 1014 | - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] |
| 1015 | - movdqa ru0,t2 |
| 1016 | - pmuludq hc2,t2 |
| 1017 | - paddq t2,t1 |
| 1018 | - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] |
| 1019 | - movdqa sv4,t2 |
| 1020 | - pmuludq hc3,t2 |
| 1021 | - paddq t2,t1 |
| 1022 | - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] |
| 1023 | - movdqa sv3,t2 |
| 1024 | - pmuludq hc4,t2 |
| 1025 | - paddq t2,t1 |
| 1026 | - # d2 = t1[0] + t1[1] |
| 1027 | - movdqa t1,t2 |
| 1028 | - psrldq $8,t2 |
| 1029 | - paddq t2,t1 |
| 1030 | - movq t1,d2 |
| 1031 | - |
| 1032 | - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] |
| 1033 | - movdqa ru3,t1 |
| 1034 | - pmuludq hc0,t1 |
| 1035 | - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] |
| 1036 | - movdqa ru2,t2 |
| 1037 | - pmuludq hc1,t2 |
| 1038 | - paddq t2,t1 |
| 1039 | - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] |
| 1040 | - movdqa ru1,t2 |
| 1041 | - pmuludq hc2,t2 |
| 1042 | - paddq t2,t1 |
| 1043 | - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] |
| 1044 | - movdqa ru0,t2 |
| 1045 | - pmuludq hc3,t2 |
| 1046 | - paddq t2,t1 |
| 1047 | - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] |
| 1048 | - movdqa sv4,t2 |
| 1049 | - pmuludq hc4,t2 |
| 1050 | - paddq t2,t1 |
| 1051 | - # d3 = t1[0] + t1[1] |
| 1052 | - movdqa t1,t2 |
| 1053 | - psrldq $8,t2 |
| 1054 | - paddq t2,t1 |
| 1055 | - movq t1,d3 |
| 1056 | - |
| 1057 | - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] |
| 1058 | - movdqa ru4,t1 |
| 1059 | - pmuludq hc0,t1 |
| 1060 | - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] |
| 1061 | - movdqa ru3,t2 |
| 1062 | - pmuludq hc1,t2 |
| 1063 | - paddq t2,t1 |
| 1064 | - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] |
| 1065 | - movdqa ru2,t2 |
| 1066 | - pmuludq hc2,t2 |
| 1067 | - paddq t2,t1 |
| 1068 | - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] |
| 1069 | - movdqa ru1,t2 |
| 1070 | - pmuludq hc3,t2 |
| 1071 | - paddq t2,t1 |
| 1072 | - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] |
| 1073 | - movdqa ru0,t2 |
| 1074 | - pmuludq hc4,t2 |
| 1075 | - paddq t2,t1 |
| 1076 | - # d4 = t1[0] + t1[1] |
| 1077 | - movdqa t1,t2 |
| 1078 | - psrldq $8,t2 |
| 1079 | - paddq t2,t1 |
| 1080 | - movq t1,d4 |
| 1081 | - |
| 1082 | - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> |
| 1083 | - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small |
| 1084 | - # amount. Careful: we must not assume the carry bits 'd0 >> 26', |
| 1085 | - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit |
| 1086 | - # integers. It's true in a single-block implementation, but not here. |
| 1087 | - |
| 1088 | - # d1 += d0 >> 26 |
| 1089 | - mov d0,%rax |
| 1090 | - shr $26,%rax |
| 1091 | - add %rax,d1 |
| 1092 | - # h0 = d0 & 0x3ffffff |
| 1093 | - mov d0,%rbx |
| 1094 | - and $0x3ffffff,%ebx |
| 1095 | - |
| 1096 | - # d2 += d1 >> 26 |
| 1097 | - mov d1,%rax |
| 1098 | - shr $26,%rax |
| 1099 | - add %rax,d2 |
| 1100 | - # h1 = d1 & 0x3ffffff |
| 1101 | - mov d1,%rax |
| 1102 | - and $0x3ffffff,%eax |
| 1103 | - mov %eax,h1 |
| 1104 | - |
| 1105 | - # d3 += d2 >> 26 |
| 1106 | - mov d2,%rax |
| 1107 | - shr $26,%rax |
| 1108 | - add %rax,d3 |
| 1109 | - # h2 = d2 & 0x3ffffff |
| 1110 | - mov d2,%rax |
| 1111 | - and $0x3ffffff,%eax |
| 1112 | - mov %eax,h2 |
| 1113 | - |
| 1114 | - # d4 += d3 >> 26 |
| 1115 | - mov d3,%rax |
| 1116 | - shr $26,%rax |
| 1117 | - add %rax,d4 |
| 1118 | - # h3 = d3 & 0x3ffffff |
| 1119 | - mov d3,%rax |
| 1120 | - and $0x3ffffff,%eax |
| 1121 | - mov %eax,h3 |
| 1122 | - |
| 1123 | - # h0 += (d4 >> 26) * 5 |
| 1124 | - mov d4,%rax |
| 1125 | - shr $26,%rax |
| 1126 | - lea (%rax,%rax,4),%rax |
| 1127 | - add %rax,%rbx |
| 1128 | - # h4 = d4 & 0x3ffffff |
| 1129 | - mov d4,%rax |
| 1130 | - and $0x3ffffff,%eax |
| 1131 | - mov %eax,h4 |
| 1132 | - |
| 1133 | - # h1 += h0 >> 26 |
| 1134 | - mov %rbx,%rax |
| 1135 | - shr $26,%rax |
| 1136 | - add %eax,h1 |
| 1137 | - # h0 = h0 & 0x3ffffff |
| 1138 | - andl $0x3ffffff,%ebx |
| 1139 | - mov %ebx,h0 |
| 1140 | - |
| 1141 | - add $0x20,m |
| 1142 | - dec %rcx |
| 1143 | - jnz .Ldoblock2 |
| 1144 | - |
| 1145 | - pop %r13 |
| 1146 | - pop %r12 |
| 1147 | - pop %rbx |
| 1148 | - ret |
| 1149 | -ENDPROC(poly1305_2block_sse2) |
| 1150 | --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| 1151 | +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl |
| 1152 | @@ -1,11 +1,14 @@ |
| 1153 | -#! /usr/bin/env perl |
| 1154 | -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
| 1155 | +#!/usr/bin/env perl |
| 1156 | +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| 1157 | # |
| 1158 | -# Licensed under the OpenSSL license (the "License"). You may not use |
| 1159 | -# this file except in compliance with the License. You can obtain a copy |
| 1160 | -# in the file LICENSE in the source distribution or at |
| 1161 | -# https://www.openssl.org/source/license.html |
| 1162 | - |
| 1163 | +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. |
| 1164 | +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 1165 | +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. |
| 1166 | +# |
| 1167 | +# This code is taken from the OpenSSL project but the author, Andy Polyakov, |
| 1168 | +# has relicensed it under the licenses specified in the SPDX header above. |
| 1169 | +# The original headers, including the original license headers, are |
| 1170 | +# included below for completeness. |
| 1171 | # |
| 1172 | # ==================================================================== |
| 1173 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 1174 | @@ -32,7 +35,7 @@ |
| 1175 | # Skylake-X system performance. Since we are likely to suppress |
| 1176 | # AVX512F capability flag [at least on Skylake-X], conversion serves |
| 1177 | # as kind of "investment protection". Note that next *lake processor, |
| 1178 | -# Cannolake, has AVX512IFMA code path to execute... |
| 1179 | +# Cannonlake, has AVX512IFMA code path to execute... |
| 1180 | # |
| 1181 | # Numbers are cycles per processed byte with poly1305_blocks alone, |
| 1182 | # measured with rdtsc at fixed clock frequency. |
| 1183 | @@ -68,39 +71,114 @@ $output = shift; |
| 1184 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 1185 | |
| 1186 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 1187 | +$kernel=0; $kernel=1 if (!$flavour && !$output); |
| 1188 | |
| 1189 | -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 1190 | -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 1191 | -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 1192 | -die "can't locate x86_64-xlate.pl"; |
| 1193 | - |
| 1194 | -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| 1195 | - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| 1196 | - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); |
| 1197 | +if (!$kernel) { |
| 1198 | + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 1199 | + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 1200 | + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 1201 | + die "can't locate x86_64-xlate.pl"; |
| 1202 | + |
| 1203 | + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| 1204 | + *STDOUT=*OUT; |
| 1205 | + |
| 1206 | + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` |
| 1207 | + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { |
| 1208 | + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); |
| 1209 | + } |
| 1210 | + |
| 1211 | + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| 1212 | + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| 1213 | + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); |
| 1214 | + $avx += 1 if ($1==2.11 && $2>=8); |
| 1215 | + } |
| 1216 | + |
| 1217 | + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| 1218 | + `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| 1219 | + $avx = ($1>=10) + ($1>=11); |
| 1220 | + } |
| 1221 | + |
| 1222 | + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| 1223 | + $avx = ($2>=3.0) + ($2>3.0); |
| 1224 | + } |
| 1225 | +} else { |
| 1226 | + $avx = 4; # The kernel uses ifdefs for this. |
| 1227 | } |
| 1228 | |
| 1229 | -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && |
| 1230 | - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
| 1231 | - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); |
| 1232 | - $avx += 2 if ($1==2.11 && $2>=8); |
| 1233 | +sub declare_function() { |
| 1234 | + my ($name, $align, $nargs) = @_; |
| 1235 | + if($kernel) { |
| 1236 | + $code .= ".align $align\n"; |
| 1237 | + $code .= "ENTRY($name)\n"; |
| 1238 | + $code .= ".L$name:\n"; |
| 1239 | + } else { |
| 1240 | + $code .= ".globl $name\n"; |
| 1241 | + $code .= ".type $name,\@function,$nargs\n"; |
| 1242 | + $code .= ".align $align\n"; |
| 1243 | + $code .= "$name:\n"; |
| 1244 | + } |
| 1245 | } |
| 1246 | |
| 1247 | -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && |
| 1248 | - `ml64 2>&1` =~ /Version ([0-9]+)\./) { |
| 1249 | - $avx = ($1>=10) + ($1>=12); |
| 1250 | +sub end_function() { |
| 1251 | + my ($name) = @_; |
| 1252 | + if($kernel) { |
| 1253 | + $code .= "ENDPROC($name)\n"; |
| 1254 | + } else { |
| 1255 | + $code .= ".size $name,.-$name\n"; |
| 1256 | + } |
| 1257 | } |
| 1258 | |
| 1259 | -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { |
| 1260 | - $avx = ($2>=3.0) + ($2>3.0); |
| 1261 | -} |
| 1262 | +$code.=<<___ if $kernel; |
| 1263 | +#include <linux/linkage.h> |
| 1264 | +___ |
| 1265 | |
| 1266 | -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| 1267 | -*STDOUT=*OUT; |
| 1268 | +if ($avx) { |
| 1269 | +$code.=<<___ if $kernel; |
| 1270 | +.section .rodata |
| 1271 | +___ |
| 1272 | +$code.=<<___; |
| 1273 | +.align 64 |
| 1274 | +.Lconst: |
| 1275 | +.Lmask24: |
| 1276 | +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| 1277 | +.L129: |
| 1278 | +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| 1279 | +.Lmask26: |
| 1280 | +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| 1281 | +.Lpermd_avx2: |
| 1282 | +.long 2,2,2,3,2,0,2,1 |
| 1283 | +.Lpermd_avx512: |
| 1284 | +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| 1285 | + |
| 1286 | +.L2_44_inp_permd: |
| 1287 | +.long 0,1,1,2,2,3,7,7 |
| 1288 | +.L2_44_inp_shift: |
| 1289 | +.quad 0,12,24,64 |
| 1290 | +.L2_44_mask: |
| 1291 | +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| 1292 | +.L2_44_shift_rgt: |
| 1293 | +.quad 44,44,42,64 |
| 1294 | +.L2_44_shift_lft: |
| 1295 | +.quad 8,8,10,64 |
| 1296 | + |
| 1297 | +.align 64 |
| 1298 | +.Lx_mask44: |
| 1299 | +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 1300 | +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 1301 | +.Lx_mask42: |
| 1302 | +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 1303 | +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 1304 | +___ |
| 1305 | +} |
| 1306 | +$code.=<<___ if (!$kernel); |
| 1307 | +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 1308 | +.align 16 |
| 1309 | +___ |
| 1310 | |
| 1311 | my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); |
| 1312 | my ($mac,$nonce)=($inp,$len); # *_emit arguments |
| 1313 | -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); |
| 1314 | -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); |
| 1315 | +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); |
| 1316 | +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); |
| 1317 | |
| 1318 | sub poly1305_iteration { |
| 1319 | # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 |
| 1320 | @@ -155,19 +233,19 @@ ___ |
| 1321 | |
| 1322 | $code.=<<___; |
| 1323 | .text |
| 1324 | - |
| 1325 | +___ |
| 1326 | +$code.=<<___ if (!$kernel); |
| 1327 | .extern OPENSSL_ia32cap_P |
| 1328 | |
| 1329 | -.globl poly1305_init |
| 1330 | -.hidden poly1305_init |
| 1331 | -.globl poly1305_blocks |
| 1332 | -.hidden poly1305_blocks |
| 1333 | -.globl poly1305_emit |
| 1334 | -.hidden poly1305_emit |
| 1335 | - |
| 1336 | -.type poly1305_init,\@function,3 |
| 1337 | -.align 32 |
| 1338 | -poly1305_init: |
| 1339 | +.globl poly1305_init_x86_64 |
| 1340 | +.hidden poly1305_init_x86_64 |
| 1341 | +.globl poly1305_blocks_x86_64 |
| 1342 | +.hidden poly1305_blocks_x86_64 |
| 1343 | +.globl poly1305_emit_x86_64 |
| 1344 | +.hidden poly1305_emit_x86_64 |
| 1345 | +___ |
| 1346 | +&declare_function("poly1305_init_x86_64", 32, 3); |
| 1347 | +$code.=<<___; |
| 1348 | xor %rax,%rax |
| 1349 | mov %rax,0($ctx) # initialize hash value |
| 1350 | mov %rax,8($ctx) |
| 1351 | @@ -175,11 +253,12 @@ poly1305_init: |
| 1352 | |
| 1353 | cmp \$0,$inp |
| 1354 | je .Lno_key |
| 1355 | - |
| 1356 | - lea poly1305_blocks(%rip),%r10 |
| 1357 | - lea poly1305_emit(%rip),%r11 |
| 1358 | ___ |
| 1359 | -$code.=<<___ if ($avx); |
| 1360 | +$code.=<<___ if (!$kernel); |
| 1361 | + lea poly1305_blocks_x86_64(%rip),%r10 |
| 1362 | + lea poly1305_emit_x86_64(%rip),%r11 |
| 1363 | +___ |
| 1364 | +$code.=<<___ if (!$kernel && $avx); |
| 1365 | mov OPENSSL_ia32cap_P+4(%rip),%r9 |
| 1366 | lea poly1305_blocks_avx(%rip),%rax |
| 1367 | lea poly1305_emit_avx(%rip),%rcx |
| 1368 | @@ -187,12 +266,12 @@ $code.=<<___ if ($avx); |
| 1369 | cmovc %rax,%r10 |
| 1370 | cmovc %rcx,%r11 |
| 1371 | ___ |
| 1372 | -$code.=<<___ if ($avx>1); |
| 1373 | +$code.=<<___ if (!$kernel && $avx>1); |
| 1374 | lea poly1305_blocks_avx2(%rip),%rax |
| 1375 | bt \$`5+32`,%r9 # AVX2? |
| 1376 | cmovc %rax,%r10 |
| 1377 | ___ |
| 1378 | -$code.=<<___ if ($avx>3); |
| 1379 | +$code.=<<___ if (!$kernel && $avx>3); |
| 1380 | mov \$`(1<<31|1<<21|1<<16)`,%rax |
| 1381 | shr \$32,%r9 |
| 1382 | and %rax,%r9 |
| 1383 | @@ -207,11 +286,11 @@ $code.=<<___; |
| 1384 | mov %rax,24($ctx) |
| 1385 | mov %rcx,32($ctx) |
| 1386 | ___ |
| 1387 | -$code.=<<___ if ($flavour !~ /elf32/); |
| 1388 | +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); |
| 1389 | mov %r10,0(%rdx) |
| 1390 | mov %r11,8(%rdx) |
| 1391 | ___ |
| 1392 | -$code.=<<___ if ($flavour =~ /elf32/); |
| 1393 | +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); |
| 1394 | mov %r10d,0(%rdx) |
| 1395 | mov %r11d,4(%rdx) |
| 1396 | ___ |
| 1397 | @@ -219,11 +298,11 @@ $code.=<<___; |
| 1398 | mov \$1,%eax |
| 1399 | .Lno_key: |
| 1400 | ret |
| 1401 | -.size poly1305_init,.-poly1305_init |
| 1402 | +___ |
| 1403 | +&end_function("poly1305_init_x86_64"); |
| 1404 | |
| 1405 | -.type poly1305_blocks,\@function,4 |
| 1406 | -.align 32 |
| 1407 | -poly1305_blocks: |
| 1408 | +&declare_function("poly1305_blocks_x86_64", 32, 4); |
| 1409 | +$code.=<<___; |
| 1410 | .cfi_startproc |
| 1411 | .Lblocks: |
| 1412 | shr \$4,$len |
| 1413 | @@ -231,8 +310,6 @@ poly1305_blocks: |
| 1414 | |
| 1415 | push %rbx |
| 1416 | .cfi_push %rbx |
| 1417 | - push %rbp |
| 1418 | -.cfi_push %rbp |
| 1419 | push %r12 |
| 1420 | .cfi_push %r12 |
| 1421 | push %r13 |
| 1422 | @@ -241,6 +318,8 @@ poly1305_blocks: |
| 1423 | .cfi_push %r14 |
| 1424 | push %r15 |
| 1425 | .cfi_push %r15 |
| 1426 | + push $ctx |
| 1427 | +.cfi_push $ctx |
| 1428 | .Lblocks_body: |
| 1429 | |
| 1430 | mov $len,%r15 # reassign $len |
| 1431 | @@ -265,26 +344,29 @@ poly1305_blocks: |
| 1432 | lea 16($inp),$inp |
| 1433 | adc $padbit,$h2 |
| 1434 | ___ |
| 1435 | + |
| 1436 | &poly1305_iteration(); |
| 1437 | + |
| 1438 | $code.=<<___; |
| 1439 | mov $r1,%rax |
| 1440 | dec %r15 # len-=16 |
| 1441 | jnz .Loop |
| 1442 | |
| 1443 | + mov 0(%rsp),$ctx |
| 1444 | +.cfi_restore $ctx |
| 1445 | + |
| 1446 | mov $h0,0($ctx) # store hash value |
| 1447 | mov $h1,8($ctx) |
| 1448 | mov $h2,16($ctx) |
| 1449 | |
| 1450 | - mov 0(%rsp),%r15 |
| 1451 | + mov 8(%rsp),%r15 |
| 1452 | .cfi_restore %r15 |
| 1453 | - mov 8(%rsp),%r14 |
| 1454 | + mov 16(%rsp),%r14 |
| 1455 | .cfi_restore %r14 |
| 1456 | - mov 16(%rsp),%r13 |
| 1457 | + mov 24(%rsp),%r13 |
| 1458 | .cfi_restore %r13 |
| 1459 | - mov 24(%rsp),%r12 |
| 1460 | + mov 32(%rsp),%r12 |
| 1461 | .cfi_restore %r12 |
| 1462 | - mov 32(%rsp),%rbp |
| 1463 | -.cfi_restore %rbp |
| 1464 | mov 40(%rsp),%rbx |
| 1465 | .cfi_restore %rbx |
| 1466 | lea 48(%rsp),%rsp |
| 1467 | @@ -293,11 +375,11 @@ $code.=<<___; |
| 1468 | .Lblocks_epilogue: |
| 1469 | ret |
| 1470 | .cfi_endproc |
| 1471 | -.size poly1305_blocks,.-poly1305_blocks |
| 1472 | +___ |
| 1473 | +&end_function("poly1305_blocks_x86_64"); |
| 1474 | |
| 1475 | -.type poly1305_emit,\@function,3 |
| 1476 | -.align 32 |
| 1477 | -poly1305_emit: |
| 1478 | +&declare_function("poly1305_emit_x86_64", 32, 3); |
| 1479 | +$code.=<<___; |
| 1480 | .Lemit: |
| 1481 | mov 0($ctx),%r8 # load hash value |
| 1482 | mov 8($ctx),%r9 |
| 1483 | @@ -318,10 +400,14 @@ poly1305_emit: |
| 1484 | mov %rcx,8($mac) |
| 1485 | |
| 1486 | ret |
| 1487 | -.size poly1305_emit,.-poly1305_emit |
| 1488 | ___ |
| 1489 | +&end_function("poly1305_emit_x86_64"); |
| 1490 | if ($avx) { |
| 1491 | |
| 1492 | +if($kernel) { |
| 1493 | + $code .= "#ifdef CONFIG_AS_AVX\n"; |
| 1494 | +} |
| 1495 | + |
| 1496 | ######################################################################## |
| 1497 | # Layout of opaque area is following. |
| 1498 | # |
| 1499 | @@ -342,15 +428,19 @@ $code.=<<___; |
| 1500 | .type __poly1305_block,\@abi-omnipotent |
| 1501 | .align 32 |
| 1502 | __poly1305_block: |
| 1503 | + push $ctx |
| 1504 | ___ |
| 1505 | &poly1305_iteration(); |
| 1506 | $code.=<<___; |
| 1507 | + pop $ctx |
| 1508 | ret |
| 1509 | .size __poly1305_block,.-__poly1305_block |
| 1510 | |
| 1511 | .type __poly1305_init_avx,\@abi-omnipotent |
| 1512 | .align 32 |
| 1513 | __poly1305_init_avx: |
| 1514 | + push %rbp |
| 1515 | + mov %rsp,%rbp |
| 1516 | mov $r0,$h0 |
| 1517 | mov $r1,$h1 |
| 1518 | xor $h2,$h2 |
| 1519 | @@ -507,12 +597,13 @@ __poly1305_init_avx: |
| 1520 | mov $d1#d,`16*8+8-64`($ctx) |
| 1521 | |
| 1522 | lea -48-64($ctx),$ctx # size [de-]optimization |
| 1523 | + pop %rbp |
| 1524 | ret |
| 1525 | .size __poly1305_init_avx,.-__poly1305_init_avx |
| 1526 | +___ |
| 1527 | |
| 1528 | -.type poly1305_blocks_avx,\@function,4 |
| 1529 | -.align 32 |
| 1530 | -poly1305_blocks_avx: |
| 1531 | +&declare_function("poly1305_blocks_avx", 32, 4); |
| 1532 | +$code.=<<___; |
| 1533 | .cfi_startproc |
| 1534 | mov 20($ctx),%r8d # is_base2_26 |
| 1535 | cmp \$128,$len |
| 1536 | @@ -532,10 +623,11 @@ poly1305_blocks_avx: |
| 1537 | test \$31,$len |
| 1538 | jz .Leven_avx |
| 1539 | |
| 1540 | - push %rbx |
| 1541 | -.cfi_push %rbx |
| 1542 | push %rbp |
| 1543 | .cfi_push %rbp |
| 1544 | + mov %rsp,%rbp |
| 1545 | + push %rbx |
| 1546 | +.cfi_push %rbx |
| 1547 | push %r12 |
| 1548 | .cfi_push %r12 |
| 1549 | push %r13 |
| 1550 | @@ -645,20 +737,18 @@ poly1305_blocks_avx: |
| 1551 | mov $h2#d,16($ctx) |
| 1552 | .align 16 |
| 1553 | .Ldone_avx: |
| 1554 | - mov 0(%rsp),%r15 |
| 1555 | + pop %r15 |
| 1556 | .cfi_restore %r15 |
| 1557 | - mov 8(%rsp),%r14 |
| 1558 | + pop %r14 |
| 1559 | .cfi_restore %r14 |
| 1560 | - mov 16(%rsp),%r13 |
| 1561 | + pop %r13 |
| 1562 | .cfi_restore %r13 |
| 1563 | - mov 24(%rsp),%r12 |
| 1564 | + pop %r12 |
| 1565 | .cfi_restore %r12 |
| 1566 | - mov 32(%rsp),%rbp |
| 1567 | -.cfi_restore %rbp |
| 1568 | - mov 40(%rsp),%rbx |
| 1569 | + pop %rbx |
| 1570 | .cfi_restore %rbx |
| 1571 | - lea 48(%rsp),%rsp |
| 1572 | -.cfi_adjust_cfa_offset -48 |
| 1573 | + pop %rbp |
| 1574 | +.cfi_restore %rbp |
| 1575 | .Lno_data_avx: |
| 1576 | .Lblocks_avx_epilogue: |
| 1577 | ret |
| 1578 | @@ -667,10 +757,11 @@ poly1305_blocks_avx: |
| 1579 | .align 32 |
| 1580 | .Lbase2_64_avx: |
| 1581 | .cfi_startproc |
| 1582 | - push %rbx |
| 1583 | -.cfi_push %rbx |
| 1584 | push %rbp |
| 1585 | .cfi_push %rbp |
| 1586 | + mov %rsp,%rbp |
| 1587 | + push %rbx |
| 1588 | +.cfi_push %rbx |
| 1589 | push %r12 |
| 1590 | .cfi_push %r12 |
| 1591 | push %r13 |
| 1592 | @@ -736,22 +827,18 @@ poly1305_blocks_avx: |
| 1593 | |
| 1594 | .Lproceed_avx: |
| 1595 | mov %r15,$len |
| 1596 | - |
| 1597 | - mov 0(%rsp),%r15 |
| 1598 | + pop %r15 |
| 1599 | .cfi_restore %r15 |
| 1600 | - mov 8(%rsp),%r14 |
| 1601 | + pop %r14 |
| 1602 | .cfi_restore %r14 |
| 1603 | - mov 16(%rsp),%r13 |
| 1604 | + pop %r13 |
| 1605 | .cfi_restore %r13 |
| 1606 | - mov 24(%rsp),%r12 |
| 1607 | + pop %r12 |
| 1608 | .cfi_restore %r12 |
| 1609 | - mov 32(%rsp),%rbp |
| 1610 | -.cfi_restore %rbp |
| 1611 | - mov 40(%rsp),%rbx |
| 1612 | + pop %rbx |
| 1613 | .cfi_restore %rbx |
| 1614 | - lea 48(%rsp),%rax |
| 1615 | - lea 48(%rsp),%rsp |
| 1616 | -.cfi_adjust_cfa_offset -48 |
| 1617 | + pop %rbp |
| 1618 | +.cfi_restore %rbp |
| 1619 | .Lbase2_64_avx_epilogue: |
| 1620 | jmp .Ldo_avx |
| 1621 | .cfi_endproc |
| 1622 | @@ -768,8 +855,11 @@ poly1305_blocks_avx: |
| 1623 | .Ldo_avx: |
| 1624 | ___ |
| 1625 | $code.=<<___ if (!$win64); |
| 1626 | + lea 8(%rsp),%r10 |
| 1627 | +.cfi_def_cfa_register %r10 |
| 1628 | + and \$-32,%rsp |
| 1629 | + sub \$-8,%rsp |
| 1630 | lea -0x58(%rsp),%r11 |
| 1631 | -.cfi_def_cfa %r11,0x60 |
| 1632 | sub \$0x178,%rsp |
| 1633 | ___ |
| 1634 | $code.=<<___ if ($win64); |
| 1635 | @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); |
| 1636 | .Ldo_avx_epilogue: |
| 1637 | ___ |
| 1638 | $code.=<<___ if (!$win64); |
| 1639 | - lea 0x58(%r11),%rsp |
| 1640 | -.cfi_def_cfa %rsp,8 |
| 1641 | + lea -8(%r10),%rsp |
| 1642 | +.cfi_def_cfa_register %rsp |
| 1643 | ___ |
| 1644 | $code.=<<___; |
| 1645 | vzeroupper |
| 1646 | ret |
| 1647 | .cfi_endproc |
| 1648 | -.size poly1305_blocks_avx,.-poly1305_blocks_avx |
| 1649 | +___ |
| 1650 | +&end_function("poly1305_blocks_avx"); |
| 1651 | |
| 1652 | -.type poly1305_emit_avx,\@function,3 |
| 1653 | -.align 32 |
| 1654 | -poly1305_emit_avx: |
| 1655 | +&declare_function("poly1305_emit_avx", 32, 3); |
| 1656 | +$code.=<<___; |
| 1657 | cmpl \$0,20($ctx) # is_base2_26? |
| 1658 | je .Lemit |
| 1659 | |
| 1660 | @@ -1423,41 +1513,51 @@ poly1305_emit_avx: |
| 1661 | mov %rcx,8($mac) |
| 1662 | |
| 1663 | ret |
| 1664 | -.size poly1305_emit_avx,.-poly1305_emit_avx |
| 1665 | ___ |
| 1666 | +&end_function("poly1305_emit_avx"); |
| 1667 | + |
| 1668 | +if ($kernel) { |
| 1669 | + $code .= "#endif\n"; |
| 1670 | +} |
| 1671 | |
| 1672 | if ($avx>1) { |
| 1673 | + |
| 1674 | +if ($kernel) { |
| 1675 | + $code .= "#ifdef CONFIG_AS_AVX2\n"; |
| 1676 | +} |
| 1677 | + |
| 1678 | my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = |
| 1679 | map("%ymm$_",(0..15)); |
| 1680 | my $S4=$MASK; |
| 1681 | |
| 1682 | +sub poly1305_blocks_avxN { |
| 1683 | + my ($avx512) = @_; |
| 1684 | + my $suffix = $avx512 ? "_avx512" : ""; |
| 1685 | $code.=<<___; |
| 1686 | -.type poly1305_blocks_avx2,\@function,4 |
| 1687 | -.align 32 |
| 1688 | -poly1305_blocks_avx2: |
| 1689 | .cfi_startproc |
| 1690 | mov 20($ctx),%r8d # is_base2_26 |
| 1691 | cmp \$128,$len |
| 1692 | - jae .Lblocks_avx2 |
| 1693 | + jae .Lblocks_avx2$suffix |
| 1694 | test %r8d,%r8d |
| 1695 | jz .Lblocks |
| 1696 | |
| 1697 | -.Lblocks_avx2: |
| 1698 | +.Lblocks_avx2$suffix: |
| 1699 | and \$-16,$len |
| 1700 | - jz .Lno_data_avx2 |
| 1701 | + jz .Lno_data_avx2$suffix |
| 1702 | |
| 1703 | vzeroupper |
| 1704 | |
| 1705 | test %r8d,%r8d |
| 1706 | - jz .Lbase2_64_avx2 |
| 1707 | + jz .Lbase2_64_avx2$suffix |
| 1708 | |
| 1709 | test \$63,$len |
| 1710 | - jz .Leven_avx2 |
| 1711 | + jz .Leven_avx2$suffix |
| 1712 | |
| 1713 | - push %rbx |
| 1714 | -.cfi_push %rbx |
| 1715 | push %rbp |
| 1716 | .cfi_push %rbp |
| 1717 | + mov %rsp,%rbp |
| 1718 | + push %rbx |
| 1719 | +.cfi_push %rbx |
| 1720 | push %r12 |
| 1721 | .cfi_push %r12 |
| 1722 | push %r13 |
| 1723 | @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: |
| 1724 | .cfi_push %r14 |
| 1725 | push %r15 |
| 1726 | .cfi_push %r15 |
| 1727 | -.Lblocks_avx2_body: |
| 1728 | +.Lblocks_avx2_body$suffix: |
| 1729 | |
| 1730 | mov $len,%r15 # reassign $len |
| 1731 | |
| 1732 | @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: |
| 1733 | shr \$2,$s1 |
| 1734 | add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| 1735 | |
| 1736 | -.Lbase2_26_pre_avx2: |
| 1737 | +.Lbase2_26_pre_avx2$suffix: |
| 1738 | add 0($inp),$h0 # accumulate input |
| 1739 | adc 8($inp),$h1 |
| 1740 | lea 16($inp),$inp |
| 1741 | @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: |
| 1742 | mov $r1,%rax |
| 1743 | |
| 1744 | test \$63,%r15 |
| 1745 | - jnz .Lbase2_26_pre_avx2 |
| 1746 | + jnz .Lbase2_26_pre_avx2$suffix |
| 1747 | |
| 1748 | test $padbit,$padbit # if $padbit is zero, |
| 1749 | - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format |
| 1750 | + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format |
| 1751 | |
| 1752 | ################################# base 2^64 -> base 2^26 |
| 1753 | mov $h0,%rax |
| 1754 | @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: |
| 1755 | or $r1,$h2 # h[4] |
| 1756 | |
| 1757 | test %r15,%r15 |
| 1758 | - jz .Lstore_base2_26_avx2 |
| 1759 | + jz .Lstore_base2_26_avx2$suffix |
| 1760 | |
| 1761 | vmovd %rax#d,%x#$H0 |
| 1762 | vmovd %rdx#d,%x#$H1 |
| 1763 | vmovd $h0#d,%x#$H2 |
| 1764 | vmovd $h1#d,%x#$H3 |
| 1765 | vmovd $h2#d,%x#$H4 |
| 1766 | - jmp .Lproceed_avx2 |
| 1767 | + jmp .Lproceed_avx2$suffix |
| 1768 | |
| 1769 | .align 32 |
| 1770 | -.Lstore_base2_64_avx2: |
| 1771 | +.Lstore_base2_64_avx2$suffix: |
| 1772 | mov $h0,0($ctx) |
| 1773 | mov $h1,8($ctx) |
| 1774 | mov $h2,16($ctx) # note that is_base2_26 is zeroed |
| 1775 | - jmp .Ldone_avx2 |
| 1776 | + jmp .Ldone_avx2$suffix |
| 1777 | |
| 1778 | .align 16 |
| 1779 | -.Lstore_base2_26_avx2: |
| 1780 | +.Lstore_base2_26_avx2$suffix: |
| 1781 | mov %rax#d,0($ctx) # store hash value base 2^26 |
| 1782 | mov %rdx#d,4($ctx) |
| 1783 | mov $h0#d,8($ctx) |
| 1784 | mov $h1#d,12($ctx) |
| 1785 | mov $h2#d,16($ctx) |
| 1786 | .align 16 |
| 1787 | -.Ldone_avx2: |
| 1788 | - mov 0(%rsp),%r15 |
| 1789 | +.Ldone_avx2$suffix: |
| 1790 | + pop %r15 |
| 1791 | .cfi_restore %r15 |
| 1792 | - mov 8(%rsp),%r14 |
| 1793 | + pop %r14 |
| 1794 | .cfi_restore %r14 |
| 1795 | - mov 16(%rsp),%r13 |
| 1796 | + pop %r13 |
| 1797 | .cfi_restore %r13 |
| 1798 | - mov 24(%rsp),%r12 |
| 1799 | + pop %r12 |
| 1800 | .cfi_restore %r12 |
| 1801 | - mov 32(%rsp),%rbp |
| 1802 | -.cfi_restore %rbp |
| 1803 | - mov 40(%rsp),%rbx |
| 1804 | + pop %rbx |
| 1805 | .cfi_restore %rbx |
| 1806 | - lea 48(%rsp),%rsp |
| 1807 | -.cfi_adjust_cfa_offset -48 |
| 1808 | -.Lno_data_avx2: |
| 1809 | -.Lblocks_avx2_epilogue: |
| 1810 | + pop %rbp |
| 1811 | +.cfi_restore %rbp |
| 1812 | +.Lno_data_avx2$suffix: |
| 1813 | +.Lblocks_avx2_epilogue$suffix: |
| 1814 | ret |
| 1815 | .cfi_endproc |
| 1816 | |
| 1817 | .align 32 |
| 1818 | -.Lbase2_64_avx2: |
| 1819 | +.Lbase2_64_avx2$suffix: |
| 1820 | .cfi_startproc |
| 1821 | - push %rbx |
| 1822 | -.cfi_push %rbx |
| 1823 | push %rbp |
| 1824 | .cfi_push %rbp |
| 1825 | + mov %rsp,%rbp |
| 1826 | + push %rbx |
| 1827 | +.cfi_push %rbx |
| 1828 | push %r12 |
| 1829 | .cfi_push %r12 |
| 1830 | push %r13 |
| 1831 | @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: |
| 1832 | .cfi_push %r14 |
| 1833 | push %r15 |
| 1834 | .cfi_push %r15 |
| 1835 | -.Lbase2_64_avx2_body: |
| 1836 | +.Lbase2_64_avx2_body$suffix: |
| 1837 | |
| 1838 | mov $len,%r15 # reassign $len |
| 1839 | |
| 1840 | @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: |
| 1841 | add $r1,$s1 # s1 = r1 + (r1 >> 2) |
| 1842 | |
| 1843 | test \$63,$len |
| 1844 | - jz .Linit_avx2 |
| 1845 | + jz .Linit_avx2$suffix |
| 1846 | |
| 1847 | -.Lbase2_64_pre_avx2: |
| 1848 | +.Lbase2_64_pre_avx2$suffix: |
| 1849 | add 0($inp),$h0 # accumulate input |
| 1850 | adc 8($inp),$h1 |
| 1851 | lea 16($inp),$inp |
| 1852 | @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: |
| 1853 | mov $r1,%rax |
| 1854 | |
| 1855 | test \$63,%r15 |
| 1856 | - jnz .Lbase2_64_pre_avx2 |
| 1857 | + jnz .Lbase2_64_pre_avx2$suffix |
| 1858 | |
| 1859 | -.Linit_avx2: |
| 1860 | +.Linit_avx2$suffix: |
| 1861 | ################################# base 2^64 -> base 2^26 |
| 1862 | mov $h0,%rax |
| 1863 | mov $h0,%rdx |
| 1864 | @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: |
| 1865 | |
| 1866 | call __poly1305_init_avx |
| 1867 | |
| 1868 | -.Lproceed_avx2: |
| 1869 | +.Lproceed_avx2$suffix: |
| 1870 | mov %r15,$len # restore $len |
| 1871 | - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| 1872 | +___ |
| 1873 | +$code.=<<___ if (!$kernel); |
| 1874 | + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| 1875 | mov \$`(1<<31|1<<30|1<<16)`,%r11d |
| 1876 | - |
| 1877 | - mov 0(%rsp),%r15 |
| 1878 | +___ |
| 1879 | +$code.=<<___; |
| 1880 | + pop %r15 |
| 1881 | .cfi_restore %r15 |
| 1882 | - mov 8(%rsp),%r14 |
| 1883 | + pop %r14 |
| 1884 | .cfi_restore %r14 |
| 1885 | - mov 16(%rsp),%r13 |
| 1886 | + pop %r13 |
| 1887 | .cfi_restore %r13 |
| 1888 | - mov 24(%rsp),%r12 |
| 1889 | + pop %r12 |
| 1890 | .cfi_restore %r12 |
| 1891 | - mov 32(%rsp),%rbp |
| 1892 | -.cfi_restore %rbp |
| 1893 | - mov 40(%rsp),%rbx |
| 1894 | + pop %rbx |
| 1895 | .cfi_restore %rbx |
| 1896 | - lea 48(%rsp),%rax |
| 1897 | - lea 48(%rsp),%rsp |
| 1898 | -.cfi_adjust_cfa_offset -48 |
| 1899 | -.Lbase2_64_avx2_epilogue: |
| 1900 | - jmp .Ldo_avx2 |
| 1901 | + pop %rbp |
| 1902 | +.cfi_restore %rbp |
| 1903 | +.Lbase2_64_avx2_epilogue$suffix: |
| 1904 | + jmp .Ldo_avx2$suffix |
| 1905 | .cfi_endproc |
| 1906 | |
| 1907 | .align 32 |
| 1908 | -.Leven_avx2: |
| 1909 | +.Leven_avx2$suffix: |
| 1910 | .cfi_startproc |
| 1911 | - mov OPENSSL_ia32cap_P+8(%rip),%r10d |
| 1912 | +___ |
| 1913 | +$code.=<<___ if (!$kernel); |
| 1914 | + mov OPENSSL_ia32cap_P+8(%rip),%r9d |
| 1915 | +___ |
| 1916 | +$code.=<<___; |
| 1917 | vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 |
| 1918 | vmovd 4*1($ctx),%x#$H1 |
| 1919 | vmovd 4*2($ctx),%x#$H2 |
| 1920 | vmovd 4*3($ctx),%x#$H3 |
| 1921 | vmovd 4*4($ctx),%x#$H4 |
| 1922 | |
| 1923 | -.Ldo_avx2: |
| 1924 | +.Ldo_avx2$suffix: |
| 1925 | ___ |
| 1926 | -$code.=<<___ if ($avx>2); |
| 1927 | +$code.=<<___ if (!$kernel && $avx>2); |
| 1928 | cmp \$512,$len |
| 1929 | jb .Lskip_avx512 |
| 1930 | - and %r11d,%r10d |
| 1931 | - test \$`1<<16`,%r10d # check for AVX512F |
| 1932 | + and %r11d,%r9d |
| 1933 | + test \$`1<<16`,%r9d # check for AVX512F |
| 1934 | jnz .Lblocks_avx512 |
| 1935 | -.Lskip_avx512: |
| 1936 | +.Lskip_avx512$suffix: |
| 1937 | +___ |
| 1938 | +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); |
| 1939 | + cmp \$512,$len |
| 1940 | + jae .Lblocks_avx512 |
| 1941 | ___ |
| 1942 | $code.=<<___ if (!$win64); |
| 1943 | - lea -8(%rsp),%r11 |
| 1944 | -.cfi_def_cfa %r11,16 |
| 1945 | + lea 8(%rsp),%r10 |
| 1946 | +.cfi_def_cfa_register %r10 |
| 1947 | sub \$0x128,%rsp |
| 1948 | ___ |
| 1949 | $code.=<<___ if ($win64); |
| 1950 | - lea -0xf8(%rsp),%r11 |
| 1951 | + lea 8(%rsp),%r10 |
| 1952 | sub \$0x1c8,%rsp |
| 1953 | - vmovdqa %xmm6,0x50(%r11) |
| 1954 | - vmovdqa %xmm7,0x60(%r11) |
| 1955 | - vmovdqa %xmm8,0x70(%r11) |
| 1956 | - vmovdqa %xmm9,0x80(%r11) |
| 1957 | - vmovdqa %xmm10,0x90(%r11) |
| 1958 | - vmovdqa %xmm11,0xa0(%r11) |
| 1959 | - vmovdqa %xmm12,0xb0(%r11) |
| 1960 | - vmovdqa %xmm13,0xc0(%r11) |
| 1961 | - vmovdqa %xmm14,0xd0(%r11) |
| 1962 | - vmovdqa %xmm15,0xe0(%r11) |
| 1963 | -.Ldo_avx2_body: |
| 1964 | + vmovdqa %xmm6,-0xb0(%r10) |
| 1965 | + vmovdqa %xmm7,-0xa0(%r10) |
| 1966 | + vmovdqa %xmm8,-0x90(%r10) |
| 1967 | + vmovdqa %xmm9,-0x80(%r10) |
| 1968 | + vmovdqa %xmm10,-0x70(%r10) |
| 1969 | + vmovdqa %xmm11,-0x60(%r10) |
| 1970 | + vmovdqa %xmm12,-0x50(%r10) |
| 1971 | + vmovdqa %xmm13,-0x40(%r10) |
| 1972 | + vmovdqa %xmm14,-0x30(%r10) |
| 1973 | + vmovdqa %xmm15,-0x20(%r10) |
| 1974 | +.Ldo_avx2_body$suffix: |
| 1975 | ___ |
| 1976 | $code.=<<___; |
| 1977 | lea .Lconst(%rip),%rcx |
| 1978 | @@ -1794,11 +1901,11 @@ $code.=<<___; |
| 1979 | |
| 1980 | vpaddq $H2,$T2,$H2 # accumulate input |
| 1981 | sub \$64,$len |
| 1982 | - jz .Ltail_avx2 |
| 1983 | - jmp .Loop_avx2 |
| 1984 | + jz .Ltail_avx2$suffix |
| 1985 | + jmp .Loop_avx2$suffix |
| 1986 | |
| 1987 | .align 32 |
| 1988 | -.Loop_avx2: |
| 1989 | +.Loop_avx2$suffix: |
| 1990 | ################################################################ |
| 1991 | # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 |
| 1992 | # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 |
| 1993 | @@ -1946,10 +2053,10 @@ $code.=<<___; |
| 1994 | vpor 32(%rcx),$T4,$T4 # padbit, yes, always |
| 1995 | |
| 1996 | sub \$64,$len |
| 1997 | - jnz .Loop_avx2 |
| 1998 | + jnz .Loop_avx2$suffix |
| 1999 | |
| 2000 | .byte 0x66,0x90 |
| 2001 | -.Ltail_avx2: |
| 2002 | +.Ltail_avx2$suffix: |
| 2003 | ################################################################ |
| 2004 | # while above multiplications were by r^4 in all lanes, in last |
| 2005 | # iteration we multiply least significant lane by r^4 and most |
| 2006 | @@ -2087,37 +2194,29 @@ $code.=<<___; |
| 2007 | vmovd %x#$H4,`4*4-48-64`($ctx) |
| 2008 | ___ |
| 2009 | $code.=<<___ if ($win64); |
| 2010 | - vmovdqa 0x50(%r11),%xmm6 |
| 2011 | - vmovdqa 0x60(%r11),%xmm7 |
| 2012 | - vmovdqa 0x70(%r11),%xmm8 |
| 2013 | - vmovdqa 0x80(%r11),%xmm9 |
| 2014 | - vmovdqa 0x90(%r11),%xmm10 |
| 2015 | - vmovdqa 0xa0(%r11),%xmm11 |
| 2016 | - vmovdqa 0xb0(%r11),%xmm12 |
| 2017 | - vmovdqa 0xc0(%r11),%xmm13 |
| 2018 | - vmovdqa 0xd0(%r11),%xmm14 |
| 2019 | - vmovdqa 0xe0(%r11),%xmm15 |
| 2020 | - lea 0xf8(%r11),%rsp |
| 2021 | -.Ldo_avx2_epilogue: |
| 2022 | + vmovdqa -0xb0(%r10),%xmm6 |
| 2023 | + vmovdqa -0xa0(%r10),%xmm7 |
| 2024 | + vmovdqa -0x90(%r10),%xmm8 |
| 2025 | + vmovdqa -0x80(%r10),%xmm9 |
| 2026 | + vmovdqa -0x70(%r10),%xmm10 |
| 2027 | + vmovdqa -0x60(%r10),%xmm11 |
| 2028 | + vmovdqa -0x50(%r10),%xmm12 |
| 2029 | + vmovdqa -0x40(%r10),%xmm13 |
| 2030 | + vmovdqa -0x30(%r10),%xmm14 |
| 2031 | + vmovdqa -0x20(%r10),%xmm15 |
| 2032 | + lea -8(%r10),%rsp |
| 2033 | +.Ldo_avx2_epilogue$suffix: |
| 2034 | ___ |
| 2035 | $code.=<<___ if (!$win64); |
| 2036 | - lea 8(%r11),%rsp |
| 2037 | -.cfi_def_cfa %rsp,8 |
| 2038 | + lea -8(%r10),%rsp |
| 2039 | +.cfi_def_cfa_register %rsp |
| 2040 | ___ |
| 2041 | $code.=<<___; |
| 2042 | vzeroupper |
| 2043 | ret |
| 2044 | .cfi_endproc |
| 2045 | -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 |
| 2046 | ___ |
| 2047 | -####################################################################### |
| 2048 | -if ($avx>2) { |
| 2049 | -# On entry we have input length divisible by 64. But since inner loop |
| 2050 | -# processes 128 bytes per iteration, cases when length is not divisible |
| 2051 | -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| 2052 | -# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| 2053 | -# for this tail, we wouldn't have to even allocate stack frame... |
| 2054 | - |
| 2055 | +if($avx > 2 && $avx512) { |
| 2056 | my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); |
| 2057 | my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); |
| 2058 | my $PADBIT="%zmm30"; |
| 2059 | @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); |
| 2060 | map(s/%y/%z/,($MASK)); |
| 2061 | |
| 2062 | $code.=<<___; |
| 2063 | -.type poly1305_blocks_avx512,\@function,4 |
| 2064 | -.align 32 |
| 2065 | -poly1305_blocks_avx512: |
| 2066 | .cfi_startproc |
| 2067 | .Lblocks_avx512: |
| 2068 | mov \$15,%eax |
| 2069 | kmovw %eax,%k2 |
| 2070 | ___ |
| 2071 | $code.=<<___ if (!$win64); |
| 2072 | - lea -8(%rsp),%r11 |
| 2073 | -.cfi_def_cfa %r11,16 |
| 2074 | + lea 8(%rsp),%r10 |
| 2075 | +.cfi_def_cfa_register %r10 |
| 2076 | sub \$0x128,%rsp |
| 2077 | ___ |
| 2078 | $code.=<<___ if ($win64); |
| 2079 | - lea -0xf8(%rsp),%r11 |
| 2080 | + lea 8(%rsp),%r10 |
| 2081 | sub \$0x1c8,%rsp |
| 2082 | - vmovdqa %xmm6,0x50(%r11) |
| 2083 | - vmovdqa %xmm7,0x60(%r11) |
| 2084 | - vmovdqa %xmm8,0x70(%r11) |
| 2085 | - vmovdqa %xmm9,0x80(%r11) |
| 2086 | - vmovdqa %xmm10,0x90(%r11) |
| 2087 | - vmovdqa %xmm11,0xa0(%r11) |
| 2088 | - vmovdqa %xmm12,0xb0(%r11) |
| 2089 | - vmovdqa %xmm13,0xc0(%r11) |
| 2090 | - vmovdqa %xmm14,0xd0(%r11) |
| 2091 | - vmovdqa %xmm15,0xe0(%r11) |
| 2092 | + vmovdqa %xmm6,-0xb0(%r10) |
| 2093 | + vmovdqa %xmm7,-0xa0(%r10) |
| 2094 | + vmovdqa %xmm8,-0x90(%r10) |
| 2095 | + vmovdqa %xmm9,-0x80(%r10) |
| 2096 | + vmovdqa %xmm10,-0x70(%r10) |
| 2097 | + vmovdqa %xmm11,-0x60(%r10) |
| 2098 | + vmovdqa %xmm12,-0x50(%r10) |
| 2099 | + vmovdqa %xmm13,-0x40(%r10) |
| 2100 | + vmovdqa %xmm14,-0x30(%r10) |
| 2101 | + vmovdqa %xmm15,-0x20(%r10) |
| 2102 | .Ldo_avx512_body: |
| 2103 | ___ |
| 2104 | $code.=<<___; |
| 2105 | @@ -2679,7 +2775,7 @@ $code.=<<___; |
| 2106 | |
| 2107 | lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 |
| 2108 | add \$64,$len |
| 2109 | - jnz .Ltail_avx2 |
| 2110 | + jnz .Ltail_avx2$suffix |
| 2111 | |
| 2112 | vpsubq $T2,$H2,$H2 # undo input accumulation |
| 2113 | vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced |
| 2114 | @@ -2690,29 +2786,61 @@ $code.=<<___; |
| 2115 | vzeroall |
| 2116 | ___ |
| 2117 | $code.=<<___ if ($win64); |
| 2118 | - movdqa 0x50(%r11),%xmm6 |
| 2119 | - movdqa 0x60(%r11),%xmm7 |
| 2120 | - movdqa 0x70(%r11),%xmm8 |
| 2121 | - movdqa 0x80(%r11),%xmm9 |
| 2122 | - movdqa 0x90(%r11),%xmm10 |
| 2123 | - movdqa 0xa0(%r11),%xmm11 |
| 2124 | - movdqa 0xb0(%r11),%xmm12 |
| 2125 | - movdqa 0xc0(%r11),%xmm13 |
| 2126 | - movdqa 0xd0(%r11),%xmm14 |
| 2127 | - movdqa 0xe0(%r11),%xmm15 |
| 2128 | - lea 0xf8(%r11),%rsp |
| 2129 | + movdqa -0xb0(%r10),%xmm6 |
| 2130 | + movdqa -0xa0(%r10),%xmm7 |
| 2131 | + movdqa -0x90(%r10),%xmm8 |
| 2132 | + movdqa -0x80(%r10),%xmm9 |
| 2133 | + movdqa -0x70(%r10),%xmm10 |
| 2134 | + movdqa -0x60(%r10),%xmm11 |
| 2135 | + movdqa -0x50(%r10),%xmm12 |
| 2136 | + movdqa -0x40(%r10),%xmm13 |
| 2137 | + movdqa -0x30(%r10),%xmm14 |
| 2138 | + movdqa -0x20(%r10),%xmm15 |
| 2139 | + lea -8(%r10),%rsp |
| 2140 | .Ldo_avx512_epilogue: |
| 2141 | ___ |
| 2142 | $code.=<<___ if (!$win64); |
| 2143 | - lea 8(%r11),%rsp |
| 2144 | -.cfi_def_cfa %rsp,8 |
| 2145 | + lea -8(%r10),%rsp |
| 2146 | +.cfi_def_cfa_register %rsp |
| 2147 | ___ |
| 2148 | $code.=<<___; |
| 2149 | ret |
| 2150 | .cfi_endproc |
| 2151 | -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 |
| 2152 | ___ |
| 2153 | -if ($avx>3) { |
| 2154 | + |
| 2155 | +} |
| 2156 | + |
| 2157 | +} |
| 2158 | + |
| 2159 | +&declare_function("poly1305_blocks_avx2", 32, 4); |
| 2160 | +poly1305_blocks_avxN(0); |
| 2161 | +&end_function("poly1305_blocks_avx2"); |
| 2162 | + |
| 2163 | +if($kernel) { |
| 2164 | + $code .= "#endif\n"; |
| 2165 | +} |
| 2166 | + |
| 2167 | +####################################################################### |
| 2168 | +if ($avx>2) { |
| 2169 | +# On entry we have input length divisible by 64. But since inner loop |
| 2170 | +# processes 128 bytes per iteration, cases when length is not divisible |
| 2171 | +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this |
| 2172 | +# reason stack layout is kept identical to poly1305_blocks_avx2. If not |
| 2173 | +# for this tail, we wouldn't have to even allocate stack frame... |
| 2174 | + |
| 2175 | +if($kernel) { |
| 2176 | + $code .= "#ifdef CONFIG_AS_AVX512\n"; |
| 2177 | +} |
| 2178 | + |
| 2179 | +&declare_function("poly1305_blocks_avx512", 32, 4); |
| 2180 | +poly1305_blocks_avxN(1); |
| 2181 | +&end_function("poly1305_blocks_avx512"); |
| 2182 | + |
| 2183 | +if ($kernel) { |
| 2184 | + $code .= "#endif\n"; |
| 2185 | +} |
| 2186 | + |
| 2187 | +if (!$kernel && $avx>3) { |
| 2188 | ######################################################################## |
| 2189 | # VPMADD52 version using 2^44 radix. |
| 2190 | # |
| 2191 | @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: |
| 2192 | .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 |
| 2193 | ___ |
| 2194 | } } } |
| 2195 | -$code.=<<___; |
| 2196 | -.align 64 |
| 2197 | -.Lconst: |
| 2198 | -.Lmask24: |
| 2199 | -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 |
| 2200 | -.L129: |
| 2201 | -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 |
| 2202 | -.Lmask26: |
| 2203 | -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 |
| 2204 | -.Lpermd_avx2: |
| 2205 | -.long 2,2,2,3,2,0,2,1 |
| 2206 | -.Lpermd_avx512: |
| 2207 | -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 |
| 2208 | - |
| 2209 | -.L2_44_inp_permd: |
| 2210 | -.long 0,1,1,2,2,3,7,7 |
| 2211 | -.L2_44_inp_shift: |
| 2212 | -.quad 0,12,24,64 |
| 2213 | -.L2_44_mask: |
| 2214 | -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff |
| 2215 | -.L2_44_shift_rgt: |
| 2216 | -.quad 44,44,42,64 |
| 2217 | -.L2_44_shift_lft: |
| 2218 | -.quad 8,8,10,64 |
| 2219 | - |
| 2220 | -.align 64 |
| 2221 | -.Lx_mask44: |
| 2222 | -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 2223 | -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff |
| 2224 | -.Lx_mask42: |
| 2225 | -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 2226 | -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff |
| 2227 | -___ |
| 2228 | } |
| 2229 | -$code.=<<___; |
| 2230 | -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 2231 | -.align 16 |
| 2232 | -___ |
| 2233 | |
| 2234 | +if (!$kernel) |
| 2235 | { # chacha20-poly1305 helpers |
| 2236 | my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order |
| 2237 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order |
| 2238 | @@ -4038,17 +4130,17 @@ avx_handler: |
| 2239 | |
| 2240 | .section .pdata |
| 2241 | .align 4 |
| 2242 | - .rva .LSEH_begin_poly1305_init |
| 2243 | - .rva .LSEH_end_poly1305_init |
| 2244 | - .rva .LSEH_info_poly1305_init |
| 2245 | - |
| 2246 | - .rva .LSEH_begin_poly1305_blocks |
| 2247 | - .rva .LSEH_end_poly1305_blocks |
| 2248 | - .rva .LSEH_info_poly1305_blocks |
| 2249 | - |
| 2250 | - .rva .LSEH_begin_poly1305_emit |
| 2251 | - .rva .LSEH_end_poly1305_emit |
| 2252 | - .rva .LSEH_info_poly1305_emit |
| 2253 | + .rva .LSEH_begin_poly1305_init_x86_64 |
| 2254 | + .rva .LSEH_end_poly1305_init_x86_64 |
| 2255 | + .rva .LSEH_info_poly1305_init_x86_64 |
| 2256 | + |
| 2257 | + .rva .LSEH_begin_poly1305_blocks_x86_64 |
| 2258 | + .rva .LSEH_end_poly1305_blocks_x86_64 |
| 2259 | + .rva .LSEH_info_poly1305_blocks_x86_64 |
| 2260 | + |
| 2261 | + .rva .LSEH_begin_poly1305_emit_x86_64 |
| 2262 | + .rva .LSEH_end_poly1305_emit_x86_64 |
| 2263 | + .rva .LSEH_info_poly1305_emit_x86_64 |
| 2264 | ___ |
| 2265 | $code.=<<___ if ($avx); |
| 2266 | .rva .LSEH_begin_poly1305_blocks_avx |
| 2267 | @@ -4088,20 +4180,20 @@ ___ |
| 2268 | $code.=<<___; |
| 2269 | .section .xdata |
| 2270 | .align 8 |
| 2271 | -.LSEH_info_poly1305_init: |
| 2272 | +.LSEH_info_poly1305_init_x86_64: |
| 2273 | .byte 9,0,0,0 |
| 2274 | .rva se_handler |
| 2275 | - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init |
| 2276 | + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 |
| 2277 | |
| 2278 | -.LSEH_info_poly1305_blocks: |
| 2279 | +.LSEH_info_poly1305_blocks_x86_64: |
| 2280 | .byte 9,0,0,0 |
| 2281 | .rva se_handler |
| 2282 | .rva .Lblocks_body,.Lblocks_epilogue |
| 2283 | |
| 2284 | -.LSEH_info_poly1305_emit: |
| 2285 | +.LSEH_info_poly1305_emit_x86_64: |
| 2286 | .byte 9,0,0,0 |
| 2287 | .rva se_handler |
| 2288 | - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit |
| 2289 | + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 |
| 2290 | ___ |
| 2291 | $code.=<<___ if ($avx); |
| 2292 | .LSEH_info_poly1305_blocks_avx_1: |
| 2293 | @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); |
| 2294 | ___ |
| 2295 | } |
| 2296 | |
| 2297 | +open SELF,$0; |
| 2298 | +while(<SELF>) { |
| 2299 | + next if (/^#!/); |
| 2300 | + last if (!s/^#/\/\// and !/^$/); |
| 2301 | + print; |
| 2302 | +} |
| 2303 | +close SELF; |
| 2304 | + |
| 2305 | foreach (split('\n',$code)) { |
| 2306 | s/\`([^\`]*)\`/eval($1)/ge; |
| 2307 | s/%r([a-z]+)#d/%e$1/g; |
| 2308 | s/%r([0-9]+)#d/%r$1d/g; |
| 2309 | s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; |
| 2310 | |
| 2311 | + if ($kernel) { |
| 2312 | + s/(^\.type.*),[0-9]+$/\1/; |
| 2313 | + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; |
| 2314 | + next if /^\.cfi.*/; |
| 2315 | + } |
| 2316 | + |
| 2317 | print $_,"\n"; |
| 2318 | } |
| 2319 | close STDOUT; |
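The vector paths above repeatedly convert the accumulator between the two radices this file works in: the scalar code keeps the roughly 130-bit hash in three 64-bit words (base 2^64), while the AVX/AVX2/AVX-512 paths keep it as five 26-bit limbs (base 2^26), as at the .Linit_avx2 and .Lstore_base2_26 labels. A minimal C sketch of that split, assuming the limb layout that the glue code's convert_to_base2_64() (further down in this patch) inverts; the function name is illustrative only:

    #include <stdint.h>

    /* Split a base 2^64 Poly1305 accumulator (hs[0..2], low word first)
     * into five base 2^26 limbs (h[0..4]).  The bit boundaries mirror the
     * inverse packing done by convert_to_base2_64() in the glue code. */
    static void base2_64_to_base2_26(const uint64_t hs[3], uint32_t h[5])
    {
            h[0] = hs[0] & 0x3ffffff;
            h[1] = (hs[0] >> 26) & 0x3ffffff;
            h[2] = ((hs[0] >> 52) | (hs[1] << 12)) & 0x3ffffff;
            h[3] = (hs[1] >> 14) & 0x3ffffff;
            h[4] = (uint32_t)((hs[1] >> 40) | (hs[2] << 24));
    }

Going the other way is exactly what the glue code has to do when it falls back from the vector code to the scalar code mid-stream.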
| 2320 | --- a/arch/x86/crypto/poly1305_glue.c |
| 2321 | +++ b/arch/x86/crypto/poly1305_glue.c |
| 2322 | @@ -1,8 +1,6 @@ |
| 2323 | -// SPDX-License-Identifier: GPL-2.0-or-later |
| 2324 | +// SPDX-License-Identifier: GPL-2.0 OR MIT |
| 2325 | /* |
| 2326 | - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code |
| 2327 | - * |
| 2328 | - * Copyright (C) 2015 Martin Willi |
| 2329 | + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 2330 | */ |
| 2331 | |
| 2332 | #include <crypto/algapi.h> |
| 2333 | @@ -13,279 +11,170 @@ |
| 2334 | #include <linux/jump_label.h> |
| 2335 | #include <linux/kernel.h> |
| 2336 | #include <linux/module.h> |
| 2337 | +#include <asm/intel-family.h> |
| 2338 | #include <asm/simd.h> |
| 2339 | |
| 2340 | -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, |
| 2341 | - const u32 *r, unsigned int blocks); |
| 2342 | -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, |
| 2343 | - unsigned int blocks, const u32 *u); |
| 2344 | -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, |
| 2345 | - unsigned int blocks, const u32 *u); |
| 2346 | +asmlinkage void poly1305_init_x86_64(void *ctx, |
| 2347 | + const u8 key[POLY1305_KEY_SIZE]); |
| 2348 | +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, |
| 2349 | + const size_t len, const u32 padbit); |
| 2350 | +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2351 | + const u32 nonce[4]); |
| 2352 | +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2353 | + const u32 nonce[4]); |
| 2354 | +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, |
| 2355 | + const u32 padbit); |
| 2356 | +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, |
| 2357 | + const u32 padbit); |
| 2358 | +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, |
| 2359 | + const size_t len, const u32 padbit); |
| 2360 | |
| 2361 | -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); |
| 2362 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); |
| 2363 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); |
| 2364 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); |
| 2365 | |
| 2366 | -static inline u64 mlt(u64 a, u64 b) |
| 2367 | -{ |
| 2368 | - return a * b; |
| 2369 | -} |
| 2370 | - |
| 2371 | -static inline u32 sr(u64 v, u_char n) |
| 2372 | -{ |
| 2373 | - return v >> n; |
| 2374 | -} |
| 2375 | - |
| 2376 | -static inline u32 and(u32 v, u32 mask) |
| 2377 | -{ |
| 2378 | - return v & mask; |
| 2379 | -} |
| 2380 | - |
| 2381 | -static void poly1305_simd_mult(u32 *a, const u32 *b) |
| 2382 | -{ |
| 2383 | - u8 m[POLY1305_BLOCK_SIZE]; |
| 2384 | - |
| 2385 | - memset(m, 0, sizeof(m)); |
| 2386 | - /* The poly1305 block function adds a hi-bit to the accumulator which |
| 2387 | - * we don't need for key multiplication; compensate for it. */ |
| 2388 | - a[4] -= 1 << 24; |
| 2389 | - poly1305_block_sse2(a, m, b, 1); |
| 2390 | -} |
| 2391 | - |
| 2392 | -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key) |
| 2393 | -{ |
| 2394 | - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ |
| 2395 | - key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; |
| 2396 | - key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; |
| 2397 | - key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; |
| 2398 | - key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; |
| 2399 | - key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; |
| 2400 | -} |
| 2401 | +struct poly1305_arch_internal { |
| 2402 | + union { |
| 2403 | + struct { |
| 2404 | + u32 h[5]; |
| 2405 | + u32 is_base2_26; |
| 2406 | + }; |
| 2407 | + u64 hs[3]; |
| 2408 | + }; |
| 2409 | + u64 r[2]; |
| 2410 | + u64 pad; |
| 2411 | + struct { u32 r2, r1, r4, r3; } rn[9]; |
| 2412 | +}; |
| 2413 | |
| 2414 | -static void poly1305_integer_blocks(struct poly1305_state *state, |
| 2415 | - const struct poly1305_key *key, |
| 2416 | - const void *src, |
| 2417 | - unsigned int nblocks, u32 hibit) |
| 2418 | +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit |
| 2419 | + * the unfortunate situation of using AVX and then having to go back to scalar |
| 2420 | + * -- because the user is silly and has called the update function from two |
| 2421 | + * separate contexts -- then we need to convert back to the original base before |
| 2422 | + * proceeding. It is possible to reason that the initial reduction below is |
| 2423 | + * sufficient given the implementation invariants. However, for an avoidance of |
| 2424 | + * doubt and because this is not performance critical, we do the full reduction |
| 2425 | + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py |
| 2426 | + */ |
| 2427 | +static void convert_to_base2_64(void *ctx) |
| 2428 | { |
| 2429 | - u32 r0, r1, r2, r3, r4; |
| 2430 | - u32 s1, s2, s3, s4; |
| 2431 | - u32 h0, h1, h2, h3, h4; |
| 2432 | - u64 d0, d1, d2, d3, d4; |
| 2433 | + struct poly1305_arch_internal *state = ctx; |
| 2434 | + u32 cy; |
| 2435 | |
| 2436 | - if (!nblocks) |
| 2437 | + if (!state->is_base2_26) |
| 2438 | return; |
| 2439 | |
| 2440 | - r0 = key->r[0]; |
| 2441 | - r1 = key->r[1]; |
| 2442 | - r2 = key->r[2]; |
| 2443 | - r3 = key->r[3]; |
| 2444 | - r4 = key->r[4]; |
| 2445 | - |
| 2446 | - s1 = r1 * 5; |
| 2447 | - s2 = r2 * 5; |
| 2448 | - s3 = r3 * 5; |
| 2449 | - s4 = r4 * 5; |
| 2450 | - |
| 2451 | - h0 = state->h[0]; |
| 2452 | - h1 = state->h[1]; |
| 2453 | - h2 = state->h[2]; |
| 2454 | - h3 = state->h[3]; |
| 2455 | - h4 = state->h[4]; |
| 2456 | - |
| 2457 | - do { |
| 2458 | - /* h += m[i] */ |
| 2459 | - h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; |
| 2460 | - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; |
| 2461 | - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; |
| 2462 | - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; |
| 2463 | - h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24); |
| 2464 | - |
| 2465 | - /* h *= r */ |
| 2466 | - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + |
| 2467 | - mlt(h3, s2) + mlt(h4, s1); |
| 2468 | - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + |
| 2469 | - mlt(h3, s3) + mlt(h4, s2); |
| 2470 | - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + |
| 2471 | - mlt(h3, s4) + mlt(h4, s3); |
| 2472 | - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + |
| 2473 | - mlt(h3, r0) + mlt(h4, s4); |
| 2474 | - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + |
| 2475 | - mlt(h3, r1) + mlt(h4, r0); |
| 2476 | - |
| 2477 | - /* (partial) h %= p */ |
| 2478 | - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); |
| 2479 | - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); |
| 2480 | - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); |
| 2481 | - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); |
| 2482 | - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); |
| 2483 | - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; |
| 2484 | - |
| 2485 | - src += POLY1305_BLOCK_SIZE; |
| 2486 | - } while (--nblocks); |
| 2487 | - |
| 2488 | - state->h[0] = h0; |
| 2489 | - state->h[1] = h1; |
| 2490 | - state->h[2] = h2; |
| 2491 | - state->h[3] = h3; |
| 2492 | - state->h[4] = h4; |
| 2493 | -} |
| 2494 | - |
| 2495 | -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst) |
| 2496 | -{ |
| 2497 | - u32 h0, h1, h2, h3, h4; |
| 2498 | - u32 g0, g1, g2, g3, g4; |
| 2499 | - u32 mask; |
| 2500 | - |
| 2501 | - /* fully carry h */ |
| 2502 | - h0 = state->h[0]; |
| 2503 | - h1 = state->h[1]; |
| 2504 | - h2 = state->h[2]; |
| 2505 | - h3 = state->h[3]; |
| 2506 | - h4 = state->h[4]; |
| 2507 | - |
| 2508 | - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; |
| 2509 | - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; |
| 2510 | - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; |
| 2511 | - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; |
| 2512 | - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; |
| 2513 | - |
| 2514 | - /* compute h + -p */ |
| 2515 | - g0 = h0 + 5; |
| 2516 | - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; |
| 2517 | - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; |
| 2518 | - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; |
| 2519 | - g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff; |
| 2520 | - |
| 2521 | - /* select h if h < p, or h + -p if h >= p */ |
| 2522 | - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; |
| 2523 | - g0 &= mask; |
| 2524 | - g1 &= mask; |
| 2525 | - g2 &= mask; |
| 2526 | - g3 &= mask; |
| 2527 | - g4 &= mask; |
| 2528 | - mask = ~mask; |
| 2529 | - h0 = (h0 & mask) | g0; |
| 2530 | - h1 = (h1 & mask) | g1; |
| 2531 | - h2 = (h2 & mask) | g2; |
| 2532 | - h3 = (h3 & mask) | g3; |
| 2533 | - h4 = (h4 & mask) | g4; |
| 2534 | - |
| 2535 | - /* h = h % (2^128) */ |
| 2536 | - put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); |
| 2537 | - put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); |
| 2538 | - put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); |
| 2539 | - put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); |
| 2540 | -} |
| 2541 | - |
| 2542 | -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) |
| 2543 | -{ |
| 2544 | - poly1305_integer_setkey(desc->opaque_r, key); |
| 2545 | - desc->s[0] = get_unaligned_le32(key + 16); |
| 2546 | - desc->s[1] = get_unaligned_le32(key + 20); |
| 2547 | - desc->s[2] = get_unaligned_le32(key + 24); |
| 2548 | - desc->s[3] = get_unaligned_le32(key + 28); |
| 2549 | - poly1305_core_init(&desc->h); |
| 2550 | - desc->buflen = 0; |
| 2551 | - desc->sset = true; |
| 2552 | - desc->rset = 1; |
| 2553 | -} |
| 2554 | -EXPORT_SYMBOL_GPL(poly1305_init_arch); |
| 2555 | - |
| 2556 | -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, |
| 2557 | - const u8 *src, unsigned int srclen) |
| 2558 | -{ |
| 2559 | - if (!dctx->sset) { |
| 2560 | - if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { |
| 2561 | - poly1305_integer_setkey(dctx->r, src); |
| 2562 | - src += POLY1305_BLOCK_SIZE; |
| 2563 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2564 | - dctx->rset = 1; |
| 2565 | - } |
| 2566 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2567 | - dctx->s[0] = get_unaligned_le32(src + 0); |
| 2568 | - dctx->s[1] = get_unaligned_le32(src + 4); |
| 2569 | - dctx->s[2] = get_unaligned_le32(src + 8); |
| 2570 | - dctx->s[3] = get_unaligned_le32(src + 12); |
| 2571 | - src += POLY1305_BLOCK_SIZE; |
| 2572 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2573 | - dctx->sset = true; |
| 2574 | - } |
| 2575 | + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; |
| 2576 | + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; |
| 2577 | + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; |
| 2578 | + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; |
| 2579 | + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; |
| 2580 | + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); |
| 2581 | + state->hs[2] = state->h[4] >> 24; |
| 2582 | +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) |
| 2583 | + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); |
| 2584 | + state->hs[2] &= 3; |
| 2585 | + state->hs[0] += cy; |
| 2586 | + state->hs[1] += (cy = ULT(state->hs[0], cy)); |
| 2587 | + state->hs[2] += ULT(state->hs[1], cy); |
| 2588 | +#undef ULT |
| 2589 | + state->is_base2_26 = 0; |
| 2590 | +} |
| 2591 | + |
| 2592 | +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) |
| 2593 | +{ |
| 2594 | + poly1305_init_x86_64(ctx, key); |
| 2595 | +} |
| 2596 | + |
| 2597 | +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, |
| 2598 | + const u32 padbit) |
| 2599 | +{ |
| 2600 | + struct poly1305_arch_internal *state = ctx; |
| 2601 | + |
| 2602 | + /* SIMD disables preemption, so relax after processing each page. */ |
| 2603 | + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || |
| 2604 | + PAGE_SIZE % POLY1305_BLOCK_SIZE); |
| 2605 | + |
| 2606 | + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| 2607 | + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || |
| 2608 | + !crypto_simd_usable()) { |
| 2609 | + convert_to_base2_64(ctx); |
| 2610 | + poly1305_blocks_x86_64(ctx, inp, len, padbit); |
| 2611 | + return; |
| 2612 | } |
| 2613 | - return srclen; |
| 2614 | -} |
| 2615 | |
| 2616 | -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, |
| 2617 | - const u8 *src, unsigned int srclen) |
| 2618 | -{ |
| 2619 | - unsigned int datalen; |
| 2620 | + for (;;) { |
| 2621 | + const size_t bytes = min_t(size_t, len, PAGE_SIZE); |
| 2622 | |
| 2623 | - if (unlikely(!dctx->sset)) { |
| 2624 | - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| 2625 | - src += srclen - datalen; |
| 2626 | - srclen = datalen; |
| 2627 | - } |
| 2628 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2629 | - poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src, |
| 2630 | - srclen / POLY1305_BLOCK_SIZE, 1); |
| 2631 | - srclen %= POLY1305_BLOCK_SIZE; |
| 2632 | + kernel_fpu_begin(); |
| 2633 | + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) |
| 2634 | + poly1305_blocks_avx512(ctx, inp, bytes, padbit); |
| 2635 | + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) |
| 2636 | + poly1305_blocks_avx2(ctx, inp, bytes, padbit); |
| 2637 | + else |
| 2638 | + poly1305_blocks_avx(ctx, inp, bytes, padbit); |
| 2639 | + kernel_fpu_end(); |
| 2640 | + len -= bytes; |
| 2641 | + if (!len) |
| 2642 | + break; |
| 2643 | + inp += bytes; |
| 2644 | } |
| 2645 | - return srclen; |
| 2646 | } |
| 2647 | |
| 2648 | -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, |
| 2649 | - const u8 *src, unsigned int srclen) |
| 2650 | -{ |
| 2651 | - unsigned int blocks, datalen; |
| 2652 | +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], |
| 2653 | + const u32 nonce[4]) |
| 2654 | +{ |
| 2655 | + struct poly1305_arch_internal *state = ctx; |
| 2656 | + |
| 2657 | + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || |
| 2658 | + !state->is_base2_26 || !crypto_simd_usable()) { |
| 2659 | + convert_to_base2_64(ctx); |
| 2660 | + poly1305_emit_x86_64(ctx, mac, nonce); |
| 2661 | + } else |
| 2662 | + poly1305_emit_avx(ctx, mac, nonce); |
| 2663 | +} |
| 2664 | + |
| 2665 | +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) |
| 2666 | +{ |
| 2667 | + poly1305_simd_init(&dctx->h, key); |
| 2668 | + dctx->s[0] = get_unaligned_le32(&key[16]); |
| 2669 | + dctx->s[1] = get_unaligned_le32(&key[20]); |
| 2670 | + dctx->s[2] = get_unaligned_le32(&key[24]); |
| 2671 | + dctx->s[3] = get_unaligned_le32(&key[28]); |
| 2672 | + dctx->buflen = 0; |
| 2673 | + dctx->sset = true; |
| 2674 | +} |
| 2675 | +EXPORT_SYMBOL(poly1305_init_arch); |
| 2676 | |
| 2677 | +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, |
| 2678 | + const u8 *inp, unsigned int len) |
| 2679 | +{ |
| 2680 | + unsigned int acc = 0; |
| 2681 | if (unlikely(!dctx->sset)) { |
| 2682 | - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); |
| 2683 | - src += srclen - datalen; |
| 2684 | - srclen = datalen; |
| 2685 | - } |
| 2686 | - |
| 2687 | - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| 2688 | - static_branch_likely(&poly1305_use_avx2) && |
| 2689 | - srclen >= POLY1305_BLOCK_SIZE * 4) { |
| 2690 | - if (unlikely(dctx->rset < 4)) { |
| 2691 | - if (dctx->rset < 2) { |
| 2692 | - dctx->r[1] = dctx->r[0]; |
| 2693 | - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| 2694 | - } |
| 2695 | - dctx->r[2] = dctx->r[1]; |
| 2696 | - poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); |
| 2697 | - dctx->r[3] = dctx->r[2]; |
| 2698 | - poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); |
| 2699 | - dctx->rset = 4; |
| 2700 | + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { |
| 2701 | + poly1305_simd_init(&dctx->h, inp); |
| 2702 | + inp += POLY1305_BLOCK_SIZE; |
| 2703 | + len -= POLY1305_BLOCK_SIZE; |
| 2704 | + acc += POLY1305_BLOCK_SIZE; |
| 2705 | + dctx->rset = 1; |
| 2706 | } |
| 2707 | - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); |
| 2708 | - poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, |
| 2709 | - dctx->r[1].r); |
| 2710 | - src += POLY1305_BLOCK_SIZE * 4 * blocks; |
| 2711 | - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; |
| 2712 | - } |
| 2713 | - |
| 2714 | - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { |
| 2715 | - if (unlikely(dctx->rset < 2)) { |
| 2716 | - dctx->r[1] = dctx->r[0]; |
| 2717 | - poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); |
| 2718 | - dctx->rset = 2; |
| 2719 | + if (len >= POLY1305_BLOCK_SIZE) { |
| 2720 | + dctx->s[0] = get_unaligned_le32(&inp[0]); |
| 2721 | + dctx->s[1] = get_unaligned_le32(&inp[4]); |
| 2722 | + dctx->s[2] = get_unaligned_le32(&inp[8]); |
| 2723 | + dctx->s[3] = get_unaligned_le32(&inp[12]); |
| 2724 | + inp += POLY1305_BLOCK_SIZE; |
| 2725 | + len -= POLY1305_BLOCK_SIZE; |
| 2726 | + acc += POLY1305_BLOCK_SIZE; |
| 2727 | + dctx->sset = true; |
| 2728 | } |
| 2729 | - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); |
| 2730 | - poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, |
| 2731 | - blocks, dctx->r[1].r); |
| 2732 | - src += POLY1305_BLOCK_SIZE * 2 * blocks; |
| 2733 | - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; |
| 2734 | - } |
| 2735 | - if (srclen >= POLY1305_BLOCK_SIZE) { |
| 2736 | - poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); |
| 2737 | - srclen -= POLY1305_BLOCK_SIZE; |
| 2738 | } |
| 2739 | - return srclen; |
| 2740 | + return acc; |
| 2741 | } |
| 2742 | |
| 2743 | void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, |
| 2744 | unsigned int srclen) |
| 2745 | { |
| 2746 | - unsigned int bytes; |
| 2747 | + unsigned int bytes, used; |
| 2748 | |
| 2749 | if (unlikely(dctx->buflen)) { |
| 2750 | bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); |
| 2751 | @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130 |
| 2752 | dctx->buflen += bytes; |
| 2753 | |
| 2754 | if (dctx->buflen == POLY1305_BLOCK_SIZE) { |
| 2755 | - if (static_branch_likely(&poly1305_use_simd) && |
| 2756 | - likely(crypto_simd_usable())) { |
| 2757 | - kernel_fpu_begin(); |
| 2758 | - poly1305_simd_blocks(dctx, dctx->buf, |
| 2759 | - POLY1305_BLOCK_SIZE); |
| 2760 | - kernel_fpu_end(); |
| 2761 | - } else { |
| 2762 | - poly1305_scalar_blocks(dctx, dctx->buf, |
| 2763 | - POLY1305_BLOCK_SIZE); |
| 2764 | - } |
| 2765 | + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) |
| 2766 | + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); |
| 2767 | dctx->buflen = 0; |
| 2768 | } |
| 2769 | } |
| 2770 | |
| 2771 | if (likely(srclen >= POLY1305_BLOCK_SIZE)) { |
| 2772 | - if (static_branch_likely(&poly1305_use_simd) && |
| 2773 | - likely(crypto_simd_usable())) { |
| 2774 | - kernel_fpu_begin(); |
| 2775 | - bytes = poly1305_simd_blocks(dctx, src, srclen); |
| 2776 | - kernel_fpu_end(); |
| 2777 | - } else { |
| 2778 | - bytes = poly1305_scalar_blocks(dctx, src, srclen); |
| 2779 | - } |
| 2780 | - src += srclen - bytes; |
| 2781 | - srclen = bytes; |
| 2782 | + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); |
| 2783 | + srclen -= bytes; |
| 2784 | + used = crypto_poly1305_setdctxkey(dctx, src, bytes); |
| 2785 | + if (likely(bytes - used)) |
| 2786 | + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); |
| 2787 | + src += bytes; |
| 2788 | } |
| 2789 | |
| 2790 | if (unlikely(srclen)) { |
| 2791 | @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130 |
| 2792 | } |
| 2793 | EXPORT_SYMBOL(poly1305_update_arch); |
| 2794 | |
| 2795 | -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst) |
| 2796 | +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) |
| 2797 | { |
| 2798 | - __le32 digest[4]; |
| 2799 | - u64 f = 0; |
| 2800 | - |
| 2801 | - if (unlikely(desc->buflen)) { |
| 2802 | - desc->buf[desc->buflen++] = 1; |
| 2803 | - memset(desc->buf + desc->buflen, 0, |
| 2804 | - POLY1305_BLOCK_SIZE - desc->buflen); |
| 2805 | - poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0); |
| 2806 | + if (unlikely(dctx->buflen)) { |
| 2807 | + dctx->buf[dctx->buflen++] = 1; |
| 2808 | + memset(dctx->buf + dctx->buflen, 0, |
| 2809 | + POLY1305_BLOCK_SIZE - dctx->buflen); |
| 2810 | + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); |
| 2811 | } |
| 2812 | |
| 2813 | - poly1305_integer_emit(&desc->h, digest); |
| 2814 | - |
| 2815 | - /* mac = (h + s) % (2^128) */ |
| 2816 | - f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0]; |
| 2817 | - put_unaligned_le32(f, dst + 0); |
| 2818 | - f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1]; |
| 2819 | - put_unaligned_le32(f, dst + 4); |
| 2820 | - f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2]; |
| 2821 | - put_unaligned_le32(f, dst + 8); |
| 2822 | - f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3]; |
| 2823 | - put_unaligned_le32(f, dst + 12); |
| 2824 | - |
| 2825 | - *desc = (struct poly1305_desc_ctx){}; |
| 2826 | + poly1305_simd_emit(&dctx->h, dst, dctx->s); |
| 2827 | + *dctx = (struct poly1305_desc_ctx){}; |
| 2828 | } |
| 2829 | EXPORT_SYMBOL(poly1305_final_arch); |
| 2830 | |
| 2831 | @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s |
| 2832 | { |
| 2833 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2834 | |
| 2835 | - poly1305_core_init(&dctx->h); |
| 2836 | - dctx->buflen = 0; |
| 2837 | - dctx->rset = 0; |
| 2838 | - dctx->sset = false; |
| 2839 | - |
| 2840 | + *dctx = (struct poly1305_desc_ctx){}; |
| 2841 | return 0; |
| 2842 | } |
| 2843 | |
| 2844 | -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| 2845 | +static int crypto_poly1305_update(struct shash_desc *desc, |
| 2846 | + const u8 *src, unsigned int srclen) |
| 2847 | { |
| 2848 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2849 | |
| 2850 | - if (unlikely(!dctx->sset)) |
| 2851 | - return -ENOKEY; |
| 2852 | - |
| 2853 | - poly1305_final_arch(dctx, dst); |
| 2854 | + poly1305_update_arch(dctx, src, srclen); |
| 2855 | return 0; |
| 2856 | } |
| 2857 | |
| 2858 | -static int poly1305_simd_update(struct shash_desc *desc, |
| 2859 | - const u8 *src, unsigned int srclen) |
| 2860 | +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) |
| 2861 | { |
| 2862 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); |
| 2863 | |
| 2864 | - poly1305_update_arch(dctx, src, srclen); |
| 2865 | + if (unlikely(!dctx->sset)) |
| 2866 | + return -ENOKEY; |
| 2867 | + |
| 2868 | + poly1305_final_arch(dctx, dst); |
| 2869 | return 0; |
| 2870 | } |
| 2871 | |
| 2872 | static struct shash_alg alg = { |
| 2873 | .digestsize = POLY1305_DIGEST_SIZE, |
| 2874 | .init = crypto_poly1305_init, |
| 2875 | - .update = poly1305_simd_update, |
| 2876 | + .update = crypto_poly1305_update, |
| 2877 | .final = crypto_poly1305_final, |
| 2878 | .descsize = sizeof(struct poly1305_desc_ctx), |
| 2879 | .base = { |
| 2880 | @@ -406,17 +265,19 @@ static struct shash_alg alg = { |
| 2881 | |
| 2882 | static int __init poly1305_simd_mod_init(void) |
| 2883 | { |
| 2884 | - if (!boot_cpu_has(X86_FEATURE_XMM2)) |
| 2885 | - return 0; |
| 2886 | - |
| 2887 | - static_branch_enable(&poly1305_use_simd); |
| 2888 | - |
| 2889 | - if (IS_ENABLED(CONFIG_AS_AVX2) && |
| 2890 | - boot_cpu_has(X86_FEATURE_AVX) && |
| 2891 | + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2892 | + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| 2893 | + static_branch_enable(&poly1305_use_avx); |
| 2894 | + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2895 | boot_cpu_has(X86_FEATURE_AVX2) && |
| 2896 | cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) |
| 2897 | static_branch_enable(&poly1305_use_avx2); |
| 2898 | - |
| 2899 | + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && |
| 2900 | + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && |
| 2901 | + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && |
| 2902 | + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ |
| 2903 | + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) |
| 2904 | + static_branch_enable(&poly1305_use_avx512); |
| 2905 | return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; |
| 2906 | } |
| 2907 | |
| 2908 | @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init); |
| 2909 | module_exit(poly1305_simd_mod_exit); |
| 2910 | |
| 2911 | MODULE_LICENSE("GPL"); |
| 2912 | -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); |
| 2913 | +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); |
| 2914 | MODULE_DESCRIPTION("Poly1305 authenticator"); |
| 2915 | MODULE_ALIAS_CRYPTO("poly1305"); |
| 2916 | MODULE_ALIAS_CRYPTO("poly1305-simd"); |
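With the rewrite above, the arch entry points poly1305_init_arch(), poly1305_update_arch() and poly1305_final_arch() are exported and handle key setup, partial-block buffering and the final reduction themselves. A minimal sketch of driving them for a one-shot MAC, assuming struct poly1305_desc_ctx and the size constants from <crypto/poly1305.h>; the wrapper name is illustrative, and in-tree users would normally reach these through the generic poly1305_init()/poly1305_update()/poly1305_final() helpers:

    #include <crypto/poly1305.h>

    /* One-shot Poly1305 MAC over a flat buffer, using the arch entry points
     * exported by the glue code above.  key is 32 bytes, mac is 16 bytes. */
    static void poly1305_mac_oneshot(const u8 key[POLY1305_KEY_SIZE],
                                     const u8 *msg, unsigned int len,
                                     u8 mac[POLY1305_DIGEST_SIZE])
    {
            struct poly1305_desc_ctx dctx;

            poly1305_init_arch(&dctx, key);         /* load r and s, reset state */
            poly1305_update_arch(&dctx, msg, len);  /* buffers partial blocks itself */
            poly1305_final_arch(&dctx, mac);        /* pad, emit, and wipe dctx */
    }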
| 2917 | --- a/lib/crypto/Kconfig |
| 2918 | +++ b/lib/crypto/Kconfig |
| 2919 | @@ -65,7 +65,7 @@ config CRYPTO_LIB_DES |
| 2920 | config CRYPTO_LIB_POLY1305_RSIZE |
| 2921 | int |
| 2922 | default 2 if MIPS |
| 2923 | - default 4 if X86_64 |
| 2924 | + default 11 if X86_64 |
| 2925 | default 9 if ARM || ARM64 |
| 2926 | default 1 |
| 2927 | |
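Stepping back to the glue code above, one detail of poly1305_simd_blocks() that generalises: kernel_fpu_begin() disables preemption, so the loop feeds the vector code at most one page of input per FPU section and re-enables preemption between chunks. A minimal sketch of that pattern on its own, with the actual block functions abstracted into caller-supplied callbacks; simd_fn and scalar_fn are illustrative stand-ins, not names from this patch:

    #include <linux/kernel.h>
    #include <linux/mm.h>
    #include <linux/types.h>
    #include <asm/fpu/api.h>
    #include <crypto/internal/simd.h>

    /* Run simd_fn over inp[0..len) in at most page-sized chunks so that
     * preemption is never disabled for longer than one page of work;
     * fall back to scalar_fn when the FPU cannot be used in this context. */
    static void run_in_page_chunks(const u8 *inp, size_t len,
                                   void (*simd_fn)(const u8 *, size_t),
                                   void (*scalar_fn)(const u8 *, size_t))
    {
            if (!crypto_simd_usable()) {
                    scalar_fn(inp, len);
                    return;
            }

            while (len) {
                    const size_t bytes = min_t(size_t, len, PAGE_SIZE);

                    kernel_fpu_begin();     /* disables preemption */
                    simd_fn(inp, bytes);
                    kernel_fpu_end();       /* re-enables preemption between chunks */

                    inp += bytes;
                    len -= bytes;
            }
    }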