blob: 49fd9707676e99fa9e8a0608b8b68201a2903361 [file] [log] [blame]
b.liue9582032025-04-17 19:18:16 +08001From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3Date: Fri, 8 Nov 2019 13:22:36 +0100
4Subject: [PATCH] crypto: curve25519 - x86_64 library and KPP implementations
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9commit bb611bdfd6be34d9f822c73305fcc83720499d38 upstream.
10
11This implementation is the fastest available x86_64 implementation, and
 12unlike Sandy2x, it doesn't require use of the floating point registers at
13all. Instead it makes use of BMI2 and ADX, available on recent
14microarchitectures. The implementation was written by Armando
15Faz-Hernández with contributions (upstream) from Samuel Neves and me,
16in addition to further changes in the kernel implementation from us.
17
18Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
19Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
20Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
21[ardb: - move to arch/x86/crypto
22 - wire into lib/crypto framework
23 - implement crypto API KPP hooks ]
24Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
25Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
26Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
27---
28 arch/x86/crypto/Makefile | 1 +
29 arch/x86/crypto/curve25519-x86_64.c | 2475 +++++++++++++++++++++++++++
30 crypto/Kconfig | 6 +
31 3 files changed, 2482 insertions(+)
32 create mode 100644 arch/x86/crypto/curve25519-x86_64.c
33
34--- a/arch/x86/crypto/Makefile
35+++ b/arch/x86/crypto/Makefile
36@@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2)
37
38 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
39 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
40+obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
41
42 # These modules require assembler to support AVX.
43 ifeq ($(avx_supported),yes)
44--- /dev/null
45+++ b/arch/x86/crypto/curve25519-x86_64.c
46@@ -0,0 +1,2475 @@
47+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
48+/*
49+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
50+ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
51+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
52+ */
53+
54+#include <crypto/curve25519.h>
55+#include <crypto/internal/kpp.h>
56+
57+#include <linux/types.h>
58+#include <linux/jump_label.h>
59+#include <linux/kernel.h>
60+#include <linux/module.h>
61+
62+#include <asm/cpufeature.h>
63+#include <asm/processor.h>
64+
65+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
66+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
67+
68+enum { NUM_WORDS_ELTFP25519 = 4 };
69+typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
70+typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
71+
72+#define mul_eltfp25519_1w_adx(c, a, b) do { \
73+ mul_256x256_integer_adx(m.buffer, a, b); \
74+ red_eltfp25519_1w_adx(c, m.buffer); \
75+} while (0)
76+
77+#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
78+ mul_256x256_integer_bmi2(m.buffer, a, b); \
79+ red_eltfp25519_1w_bmi2(c, m.buffer); \
80+} while (0)
81+
82+#define sqr_eltfp25519_1w_adx(a) do { \
83+ sqr_256x256_integer_adx(m.buffer, a); \
84+ red_eltfp25519_1w_adx(a, m.buffer); \
85+} while (0)
86+
87+#define sqr_eltfp25519_1w_bmi2(a) do { \
88+ sqr_256x256_integer_bmi2(m.buffer, a); \
89+ red_eltfp25519_1w_bmi2(a, m.buffer); \
90+} while (0)
91+
92+#define mul_eltfp25519_2w_adx(c, a, b) do { \
93+ mul2_256x256_integer_adx(m.buffer, a, b); \
94+ red_eltfp25519_2w_adx(c, m.buffer); \
95+} while (0)
96+
97+#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
98+ mul2_256x256_integer_bmi2(m.buffer, a, b); \
99+ red_eltfp25519_2w_bmi2(c, m.buffer); \
100+} while (0)
101+
102+#define sqr_eltfp25519_2w_adx(a) do { \
103+ sqr2_256x256_integer_adx(m.buffer, a); \
104+ red_eltfp25519_2w_adx(a, m.buffer); \
105+} while (0)
106+
107+#define sqr_eltfp25519_2w_bmi2(a) do { \
108+ sqr2_256x256_integer_bmi2(m.buffer, a); \
109+ red_eltfp25519_2w_bmi2(a, m.buffer); \
110+} while (0)
111+
112+#define sqrn_eltfp25519_1w_adx(a, times) do { \
113+ int ____counter = (times); \
114+ while (____counter-- > 0) \
115+ sqr_eltfp25519_1w_adx(a); \
116+} while (0)
117+
118+#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
119+ int ____counter = (times); \
120+ while (____counter-- > 0) \
121+ sqr_eltfp25519_1w_bmi2(a); \
122+} while (0)
123+
124+#define copy_eltfp25519_1w(C, A) do { \
125+ (C)[0] = (A)[0]; \
126+ (C)[1] = (A)[1]; \
127+ (C)[2] = (A)[2]; \
128+ (C)[3] = (A)[3]; \
129+} while (0)
130+
131+#define setzero_eltfp25519_1w(C) do { \
132+ (C)[0] = 0; \
133+ (C)[1] = 0; \
134+ (C)[2] = 0; \
135+ (C)[3] = 0; \
136+} while (0)
137+
138+__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
139+ /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
140+ 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
141+ /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
142+ 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
143+ /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
144+ 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
145+ /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
146+ 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
147+ /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
148+ 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
149+ /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
150+ 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
151+ /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
152+ 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
153+ /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
154+ 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
155+ /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
156+ 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
157+ /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
158+ 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
159+ /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
160+ 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
161+ /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
162+ 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
163+ /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
164+ 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
165+ /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
166+ 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
167+ /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
168+ 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
169+ /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
170+ 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
171+ /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
172+ 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
173+ /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
174+ 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
175+ /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
176+ 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
177+ /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
178+ 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
179+ /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
180+ 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
181+ /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
182+ 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
183+ /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
184+ 0x23758739f630a257UL, 0x295a407a01a78580UL,
185+ /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
186+ 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
187+ /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
188+ 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
189+ /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
190+ 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
191+ /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
192+ 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
193+ /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
194+ 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
195+ /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
196+ 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
197+ /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
198+ 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
199+ /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
200+ 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
201+ /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
202+ 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
203+ /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
204+ 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
205+ /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
206+ 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
207+ /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
208+ 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
209+ /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
210+ 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
211+ /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
212+ 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
213+ /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
214+ 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
215+ /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
216+ 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
217+ /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
218+ 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
219+ /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
220+ 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
221+ /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
222+ 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
223+ /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
224+ 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
225+ /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
226+ 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
227+ /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
228+ 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
229+ /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
230+ 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
231+ /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
232+ 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
233+ /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
234+ 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
235+ /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
236+ 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
237+ /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
238+ 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
239+ /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
240+ 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
241+ /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
242+ 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
243+ /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
244+ 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
245+ /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
246+ 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
247+ /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
248+ 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
249+ /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
250+ 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
251+ /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
252+ 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
253+ /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
254+ 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
255+ /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
256+ 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
257+ /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
258+ 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
259+ /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
260+ 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
261+ /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
262+ 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
263+ /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
264+ 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
265+ /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
266+ 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
267+ /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
268+ 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
269+ /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
270+ 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
271+ /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
272+ 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
273+ /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
274+ 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
275+ /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
276+ 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
277+ /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
278+ 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
279+ /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
280+ 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
281+ /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
282+ 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
283+ /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
284+ 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
285+ /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
286+ 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
287+ /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
288+ 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
289+ /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
290+ 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
291+ /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
292+ 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
293+ /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
294+ 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
295+ /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
296+ 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
297+ /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
298+ 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
299+ /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
300+ 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
301+ /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
302+ 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
303+ /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
304+ 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
305+ /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
306+ 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
307+ /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
308+ 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
309+ /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
310+ 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
311+ /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
312+ 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
313+ /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
314+ 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
315+ /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
316+ 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
317+ /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
318+ 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
319+ /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
320+ 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
321+ /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
322+ 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
323+ /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
324+ 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
325+ /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
326+ 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
327+ /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
328+ 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
329+ /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
330+ 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
331+ /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
332+ 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
333+ /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
334+ 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
335+ /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
336+ 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
337+ /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
338+ 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
339+ /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
340+ 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
341+ /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
342+ 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
343+ /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
344+ 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
345+ /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
346+ 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
347+ /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
348+ 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
349+ /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
350+ 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
351+ /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
352+ 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
353+ /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
354+ 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
355+ /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
356+ 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
357+ /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
358+ 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
359+ /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
360+ 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
361+ /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
362+ 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
363+ /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
364+ 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
365+ /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
366+ 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
367+ /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
368+ 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
369+ /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
370+ 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
371+ /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
372+ 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
373+ /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
374+ 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
375+ /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
376+ 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
377+ /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
378+ 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
379+ /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
380+ 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
381+ /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
382+ 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
383+ /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
384+ 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
385+ /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
386+ 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
387+ /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
388+ 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
389+ /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
390+ 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
391+ /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
392+ 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
393+ /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
394+ 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
395+ /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
396+ 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
397+ /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
398+ 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
399+ /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
400+ 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
401+ /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
402+ 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
403+ /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
404+ 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
405+ /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
406+ 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
407+ /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
408+ 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
409+ /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
410+ 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
411+ /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
412+ 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
413+ /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
414+ 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
415+ /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
416+ 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
417+ /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
418+ 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
419+ /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
420+ 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
421+ /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
422+ 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
423+ /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
424+ 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
425+ /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
426+ 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
427+ /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
428+ 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
429+ /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
430+ 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
431+ /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
432+ 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
433+ /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
434+ 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
435+ /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
436+ 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
437+ /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
438+ 0x52d17436309d4253UL, 0x356f97e13efae576UL,
439+ /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
440+ 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
441+ /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
442+ 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
443+ /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
444+ 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
445+ /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
446+ 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
447+ /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
448+ 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
449+ /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
450+ 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
451+ /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
452+ 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
453+ /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
454+ 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
455+ /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
456+ 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
457+ /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
458+ 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
459+ /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
460+ 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
461+ /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
462+ 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
463+ /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
464+ 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
465+ /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
466+ 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
467+ /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
468+ 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
469+ /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
470+ 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
471+ /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
472+ 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
473+ /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
474+ 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
475+ /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
476+ 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
477+ /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
478+ 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
479+ /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
480+ 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
481+ /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
482+ 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
483+ /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
484+ 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
485+ /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
486+ 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
487+ /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
488+ 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
489+ /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
490+ 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
491+ /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
492+ 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
493+ /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
494+ 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
495+ /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
496+ 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
497+ /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
498+ 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
499+ /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
500+ 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
501+ /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
502+ 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
503+ /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
504+ 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
505+ /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
506+ 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
507+ /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
508+ 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
509+ /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
510+ 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
511+ /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
512+ 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
513+ /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
514+ 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
515+ /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
516+ 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
517+ /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
518+ 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
519+ /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
520+ 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
521+ /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
522+ 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
523+ /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
524+ 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
525+ /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
526+ 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
527+ /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
528+ 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
529+ /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
530+ 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
531+ /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
532+ 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
533+ /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
534+ 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
535+ /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
536+ 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
537+ /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
538+ 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
539+ /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
540+ 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
541+ /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
542+ 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
543+ /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
544+ 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
545+ /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
546+ 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
547+ /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
548+ 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
549+ /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
550+ 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
551+ /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
552+ 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
553+ /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
554+ 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
555+ /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
556+ 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
557+ /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
558+ 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
559+ /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
560+ 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
561+ /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
562+ 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
563+ /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
564+ 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
565+ /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
566+ 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
567+ /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
568+ 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
569+ /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
570+ 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
571+ /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
572+ 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
573+ /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
574+ 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
575+ /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
576+ 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
577+ /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
578+ 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
579+ /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
580+ 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
581+ /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
582+ 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
583+ /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
584+ 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
585+ /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
586+ 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
587+ /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
588+ 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
589+ /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
590+ 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
591+ /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
592+ 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
593+ /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
594+ 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
595+ /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
596+ 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
597+ /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
598+ 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
599+ /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
600+ 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
601+ /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
602+ 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
603+ /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
604+ 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
605+ /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
606+ 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
607+ /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
608+ 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
609+ /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
610+ 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
611+ /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
612+ 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
613+ /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
614+ 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
615+ /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
616+ 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
617+ /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
618+ 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
619+ /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
620+ 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
621+ /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
622+ 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
623+ /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
624+ 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
625+ /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
626+ 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
627+ /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
628+ 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
629+ /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
630+ 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
631+ /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
632+ 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
633+ /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
634+ 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
635+ /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
636+ 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
637+ /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
638+ 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
639+ /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
640+ 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
641+ /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
642+ 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
643+};
644+
645+/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
646+ * a is two 256-bit integers: a0[0:3] and a1[4:7]
647+ * b is two 256-bit integers: b0[0:3] and b1[4:7]
648+ */
649+static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
650+ const u64 *const b)
651+{
652+ asm volatile(
653+ "xorl %%r14d, %%r14d ;"
654+ "movq (%1), %%rdx; " /* A[0] */
655+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
656+ "xorl %%r10d, %%r10d ;"
657+ "movq %%r8, (%0) ;"
658+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
659+ "adox %%r10, %%r15 ;"
660+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
661+ "adox %%r8, %%rax ;"
662+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
663+ "adox %%r10, %%rbx ;"
664+ /******************************************/
665+ "adox %%r14, %%rcx ;"
666+
667+ "movq 8(%1), %%rdx; " /* A[1] */
668+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
669+ "adox %%r15, %%r8 ;"
670+ "movq %%r8, 8(%0) ;"
671+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
672+ "adox %%r10, %%r9 ;"
673+ "adcx %%r9, %%rax ;"
674+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
675+ "adox %%r8, %%r11 ;"
676+ "adcx %%r11, %%rbx ;"
677+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
678+ "adox %%r10, %%r13 ;"
679+ "adcx %%r13, %%rcx ;"
680+ /******************************************/
681+ "adox %%r14, %%r15 ;"
682+ "adcx %%r14, %%r15 ;"
683+
684+ "movq 16(%1), %%rdx; " /* A[2] */
685+ "xorl %%r10d, %%r10d ;"
686+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
687+ "adox %%rax, %%r8 ;"
688+ "movq %%r8, 16(%0) ;"
689+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
690+ "adox %%r10, %%r9 ;"
691+ "adcx %%r9, %%rbx ;"
692+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
693+ "adox %%r8, %%r11 ;"
694+ "adcx %%r11, %%rcx ;"
695+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
696+ "adox %%r10, %%r13 ;"
697+ "adcx %%r13, %%r15 ;"
698+ /******************************************/
699+ "adox %%r14, %%rax ;"
700+ "adcx %%r14, %%rax ;"
701+
702+ "movq 24(%1), %%rdx; " /* A[3] */
703+ "xorl %%r10d, %%r10d ;"
704+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
705+ "adox %%rbx, %%r8 ;"
706+ "movq %%r8, 24(%0) ;"
707+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
708+ "adox %%r10, %%r9 ;"
709+ "adcx %%r9, %%rcx ;"
710+ "movq %%rcx, 32(%0) ;"
711+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
712+ "adox %%r8, %%r11 ;"
713+ "adcx %%r11, %%r15 ;"
714+ "movq %%r15, 40(%0) ;"
715+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
716+ "adox %%r10, %%r13 ;"
717+ "adcx %%r13, %%rax ;"
718+ "movq %%rax, 48(%0) ;"
719+ /******************************************/
720+ "adox %%r14, %%rbx ;"
721+ "adcx %%r14, %%rbx ;"
722+ "movq %%rbx, 56(%0) ;"
723+
724+ "movq 32(%1), %%rdx; " /* C[0] */
725+ "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
726+ "xorl %%r10d, %%r10d ;"
727+ "movq %%r8, 64(%0);"
728+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
729+ "adox %%r10, %%r15 ;"
730+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
731+ "adox %%r8, %%rax ;"
732+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
733+ "adox %%r10, %%rbx ;"
734+ /******************************************/
735+ "adox %%r14, %%rcx ;"
736+
737+ "movq 40(%1), %%rdx; " /* C[1] */
738+ "xorl %%r10d, %%r10d ;"
739+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
740+ "adox %%r15, %%r8 ;"
741+ "movq %%r8, 72(%0);"
742+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
743+ "adox %%r10, %%r9 ;"
744+ "adcx %%r9, %%rax ;"
745+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
746+ "adox %%r8, %%r11 ;"
747+ "adcx %%r11, %%rbx ;"
748+ "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
749+ "adox %%r10, %%r13 ;"
750+ "adcx %%r13, %%rcx ;"
751+ /******************************************/
752+ "adox %%r14, %%r15 ;"
753+ "adcx %%r14, %%r15 ;"
754+
755+ "movq 48(%1), %%rdx; " /* C[2] */
756+ "xorl %%r10d, %%r10d ;"
757+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
758+ "adox %%rax, %%r8 ;"
759+ "movq %%r8, 80(%0);"
760+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
761+ "adox %%r10, %%r9 ;"
762+ "adcx %%r9, %%rbx ;"
763+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
764+ "adox %%r8, %%r11 ;"
765+ "adcx %%r11, %%rcx ;"
766+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
767+ "adox %%r10, %%r13 ;"
768+ "adcx %%r13, %%r15 ;"
769+ /******************************************/
770+ "adox %%r14, %%rax ;"
771+ "adcx %%r14, %%rax ;"
772+
773+ "movq 56(%1), %%rdx; " /* C[3] */
774+ "xorl %%r10d, %%r10d ;"
775+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
776+ "adox %%rbx, %%r8 ;"
777+ "movq %%r8, 88(%0);"
778+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
779+ "adox %%r10, %%r9 ;"
780+ "adcx %%r9, %%rcx ;"
781+ "movq %%rcx, 96(%0) ;"
782+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
783+ "adox %%r8, %%r11 ;"
784+ "adcx %%r11, %%r15 ;"
785+ "movq %%r15, 104(%0) ;"
786+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
787+ "adox %%r10, %%r13 ;"
788+ "adcx %%r13, %%rax ;"
789+ "movq %%rax, 112(%0) ;"
790+ /******************************************/
791+ "adox %%r14, %%rbx ;"
792+ "adcx %%r14, %%rbx ;"
793+ "movq %%rbx, 120(%0) ;"
794+ :
795+ : "r"(c), "r"(a), "r"(b)
796+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
797+ "%r10", "%r11", "%r13", "%r14", "%r15");
798+}
799+
800+static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
801+ const u64 *const b)
802+{
803+ asm volatile(
804+ "movq (%1), %%rdx; " /* A[0] */
805+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
806+ "movq %%r8, (%0) ;"
807+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
808+ "addq %%r10, %%r15 ;"
809+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
810+ "adcq %%r8, %%rax ;"
811+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
812+ "adcq %%r10, %%rbx ;"
813+ /******************************************/
814+ "adcq $0, %%rcx ;"
815+
816+ "movq 8(%1), %%rdx; " /* A[1] */
817+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
818+ "addq %%r15, %%r8 ;"
819+ "movq %%r8, 8(%0) ;"
820+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
821+ "adcq %%r10, %%r9 ;"
822+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
823+ "adcq %%r8, %%r11 ;"
824+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
825+ "adcq %%r10, %%r13 ;"
826+ /******************************************/
827+ "adcq $0, %%r15 ;"
828+
829+ "addq %%r9, %%rax ;"
830+ "adcq %%r11, %%rbx ;"
831+ "adcq %%r13, %%rcx ;"
832+ "adcq $0, %%r15 ;"
833+
834+ "movq 16(%1), %%rdx; " /* A[2] */
835+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
836+ "addq %%rax, %%r8 ;"
837+ "movq %%r8, 16(%0) ;"
838+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
839+ "adcq %%r10, %%r9 ;"
840+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
841+ "adcq %%r8, %%r11 ;"
842+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
843+ "adcq %%r10, %%r13 ;"
844+ /******************************************/
845+ "adcq $0, %%rax ;"
846+
847+ "addq %%r9, %%rbx ;"
848+ "adcq %%r11, %%rcx ;"
849+ "adcq %%r13, %%r15 ;"
850+ "adcq $0, %%rax ;"
851+
852+ "movq 24(%1), %%rdx; " /* A[3] */
853+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
854+ "addq %%rbx, %%r8 ;"
855+ "movq %%r8, 24(%0) ;"
856+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
857+ "adcq %%r10, %%r9 ;"
858+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
859+ "adcq %%r8, %%r11 ;"
860+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
861+ "adcq %%r10, %%r13 ;"
862+ /******************************************/
863+ "adcq $0, %%rbx ;"
864+
865+ "addq %%r9, %%rcx ;"
866+ "movq %%rcx, 32(%0) ;"
867+ "adcq %%r11, %%r15 ;"
868+ "movq %%r15, 40(%0) ;"
869+ "adcq %%r13, %%rax ;"
870+ "movq %%rax, 48(%0) ;"
871+ "adcq $0, %%rbx ;"
872+ "movq %%rbx, 56(%0) ;"
873+
874+ "movq 32(%1), %%rdx; " /* C[0] */
875+ "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
876+ "movq %%r8, 64(%0) ;"
877+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
878+ "addq %%r10, %%r15 ;"
879+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
880+ "adcq %%r8, %%rax ;"
881+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
882+ "adcq %%r10, %%rbx ;"
883+ /******************************************/
884+ "adcq $0, %%rcx ;"
885+
886+ "movq 40(%1), %%rdx; " /* C[1] */
887+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
888+ "addq %%r15, %%r8 ;"
889+ "movq %%r8, 72(%0) ;"
890+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
891+ "adcq %%r10, %%r9 ;"
892+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
893+ "adcq %%r8, %%r11 ;"
894+ "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
895+ "adcq %%r10, %%r13 ;"
896+ /******************************************/
897+ "adcq $0, %%r15 ;"
898+
899+ "addq %%r9, %%rax ;"
900+ "adcq %%r11, %%rbx ;"
901+ "adcq %%r13, %%rcx ;"
902+ "adcq $0, %%r15 ;"
903+
904+ "movq 48(%1), %%rdx; " /* C[2] */
905+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
906+ "addq %%rax, %%r8 ;"
907+ "movq %%r8, 80(%0) ;"
908+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
909+ "adcq %%r10, %%r9 ;"
910+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
911+ "adcq %%r8, %%r11 ;"
912+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
913+ "adcq %%r10, %%r13 ;"
914+ /******************************************/
915+ "adcq $0, %%rax ;"
916+
917+ "addq %%r9, %%rbx ;"
918+ "adcq %%r11, %%rcx ;"
919+ "adcq %%r13, %%r15 ;"
920+ "adcq $0, %%rax ;"
921+
922+ "movq 56(%1), %%rdx; " /* C[3] */
923+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
924+ "addq %%rbx, %%r8 ;"
925+ "movq %%r8, 88(%0) ;"
926+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
927+ "adcq %%r10, %%r9 ;"
928+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
929+ "adcq %%r8, %%r11 ;"
930+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
931+ "adcq %%r10, %%r13 ;"
932+ /******************************************/
933+ "adcq $0, %%rbx ;"
934+
935+ "addq %%r9, %%rcx ;"
936+ "movq %%rcx, 96(%0) ;"
937+ "adcq %%r11, %%r15 ;"
938+ "movq %%r15, 104(%0) ;"
939+ "adcq %%r13, %%rax ;"
940+ "movq %%rax, 112(%0) ;"
941+ "adcq $0, %%rbx ;"
942+ "movq %%rbx, 120(%0) ;"
943+ :
944+ : "r"(c), "r"(a), "r"(b)
945+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
946+ "%r10", "%r11", "%r13", "%r15");
947+}
948+
949+static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
950+{
951+ asm volatile(
952+ "movq (%1), %%rdx ;" /* A[0] */
953+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
954+ "xorl %%r15d, %%r15d;"
955+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
956+ "adcx %%r14, %%r9 ;"
957+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
958+ "adcx %%rax, %%r10 ;"
959+ "movq 24(%1), %%rdx ;" /* A[3] */
960+ "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
961+ "adcx %%rcx, %%r11 ;"
962+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
963+ "adcx %%rax, %%rbx ;"
964+ "movq 8(%1), %%rdx ;" /* A[1] */
965+ "adcx %%r15, %%r13 ;"
966+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
967+ "movq $0, %%r14 ;"
968+ /******************************************/
969+ "adcx %%r15, %%r14 ;"
970+
971+ "xorl %%r15d, %%r15d;"
972+ "adox %%rax, %%r10 ;"
973+ "adcx %%r8, %%r8 ;"
974+ "adox %%rcx, %%r11 ;"
975+ "adcx %%r9, %%r9 ;"
976+ "adox %%r15, %%rbx ;"
977+ "adcx %%r10, %%r10 ;"
978+ "adox %%r15, %%r13 ;"
979+ "adcx %%r11, %%r11 ;"
980+ "adox %%r15, %%r14 ;"
981+ "adcx %%rbx, %%rbx ;"
982+ "adcx %%r13, %%r13 ;"
983+ "adcx %%r14, %%r14 ;"
984+
985+ "movq (%1), %%rdx ;"
986+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
987+ /*******************/
988+ "movq %%rax, 0(%0) ;"
989+ "addq %%rcx, %%r8 ;"
990+ "movq %%r8, 8(%0) ;"
991+ "movq 8(%1), %%rdx ;"
992+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
993+ "adcq %%rax, %%r9 ;"
994+ "movq %%r9, 16(%0) ;"
995+ "adcq %%rcx, %%r10 ;"
996+ "movq %%r10, 24(%0) ;"
997+ "movq 16(%1), %%rdx ;"
998+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
999+ "adcq %%rax, %%r11 ;"
1000+ "movq %%r11, 32(%0) ;"
1001+ "adcq %%rcx, %%rbx ;"
1002+ "movq %%rbx, 40(%0) ;"
1003+ "movq 24(%1), %%rdx ;"
1004+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1005+ "adcq %%rax, %%r13 ;"
1006+ "movq %%r13, 48(%0) ;"
1007+ "adcq %%rcx, %%r14 ;"
1008+ "movq %%r14, 56(%0) ;"
1009+
1010+
1011+ "movq 32(%1), %%rdx ;" /* B[0] */
1012+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
1013+ "xorl %%r15d, %%r15d;"
1014+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
1015+ "adcx %%r14, %%r9 ;"
1016+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
1017+ "adcx %%rax, %%r10 ;"
1018+ "movq 56(%1), %%rdx ;" /* B[3] */
1019+ "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
1020+ "adcx %%rcx, %%r11 ;"
1021+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
1022+ "adcx %%rax, %%rbx ;"
1023+ "movq 40(%1), %%rdx ;" /* B[1] */
1024+ "adcx %%r15, %%r13 ;"
1025+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
1026+ "movq $0, %%r14 ;"
1027+ /******************************************/
1028+ "adcx %%r15, %%r14 ;"
1029+
1030+ "xorl %%r15d, %%r15d;"
1031+ "adox %%rax, %%r10 ;"
1032+ "adcx %%r8, %%r8 ;"
1033+ "adox %%rcx, %%r11 ;"
1034+ "adcx %%r9, %%r9 ;"
1035+ "adox %%r15, %%rbx ;"
1036+ "adcx %%r10, %%r10 ;"
1037+ "adox %%r15, %%r13 ;"
1038+ "adcx %%r11, %%r11 ;"
1039+ "adox %%r15, %%r14 ;"
1040+ "adcx %%rbx, %%rbx ;"
1041+ "adcx %%r13, %%r13 ;"
1042+ "adcx %%r14, %%r14 ;"
1043+
1044+ "movq 32(%1), %%rdx ;"
1045+ "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
1046+ /*******************/
1047+ "movq %%rax, 64(%0) ;"
1048+ "addq %%rcx, %%r8 ;"
1049+ "movq %%r8, 72(%0) ;"
1050+ "movq 40(%1), %%rdx ;"
1051+ "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
1052+ "adcq %%rax, %%r9 ;"
1053+ "movq %%r9, 80(%0) ;"
1054+ "adcq %%rcx, %%r10 ;"
1055+ "movq %%r10, 88(%0) ;"
1056+ "movq 48(%1), %%rdx ;"
1057+ "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
1058+ "adcq %%rax, %%r11 ;"
1059+ "movq %%r11, 96(%0) ;"
1060+ "adcq %%rcx, %%rbx ;"
1061+ "movq %%rbx, 104(%0) ;"
1062+ "movq 56(%1), %%rdx ;"
1063+ "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
1064+ "adcq %%rax, %%r13 ;"
1065+ "movq %%r13, 112(%0) ;"
1066+ "adcq %%rcx, %%r14 ;"
1067+ "movq %%r14, 120(%0) ;"
1068+ :
1069+ : "r"(c), "r"(a)
1070+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1071+ "%r10", "%r11", "%r13", "%r14", "%r15");
1072+}
1073+
1074+static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1075+{
1076+ asm volatile(
1077+ "movq 8(%1), %%rdx ;" /* A[1] */
1078+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1079+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1080+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1081+
1082+ "movq 16(%1), %%rdx ;" /* A[2] */
1083+ "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1084+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1085+
1086+ "addq %%rax, %%r9 ;"
1087+ "adcq %%rdx, %%r10 ;"
1088+ "adcq %%rcx, %%r11 ;"
1089+ "adcq %%r14, %%r15 ;"
1090+ "adcq $0, %%r13 ;"
1091+ "movq $0, %%r14 ;"
1092+ "adcq $0, %%r14 ;"
1093+
1094+ "movq (%1), %%rdx ;" /* A[0] */
1095+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1096+
1097+ "addq %%rax, %%r10 ;"
1098+ "adcq %%rcx, %%r11 ;"
1099+ "adcq $0, %%r15 ;"
1100+ "adcq $0, %%r13 ;"
1101+ "adcq $0, %%r14 ;"
1102+
1103+ "shldq $1, %%r13, %%r14 ;"
1104+ "shldq $1, %%r15, %%r13 ;"
1105+ "shldq $1, %%r11, %%r15 ;"
1106+ "shldq $1, %%r10, %%r11 ;"
1107+ "shldq $1, %%r9, %%r10 ;"
1108+ "shldq $1, %%r8, %%r9 ;"
1109+ "shlq $1, %%r8 ;"
1110+
1111+ /*******************/
1112+ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
1113+ /*******************/
1114+ "movq %%rax, 0(%0) ;"
1115+ "addq %%rcx, %%r8 ;"
1116+ "movq %%r8, 8(%0) ;"
1117+ "movq 8(%1), %%rdx ;"
1118+ "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
1119+ "adcq %%rax, %%r9 ;"
1120+ "movq %%r9, 16(%0) ;"
1121+ "adcq %%rcx, %%r10 ;"
1122+ "movq %%r10, 24(%0) ;"
1123+ "movq 16(%1), %%rdx ;"
1124+ "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
1125+ "adcq %%rax, %%r11 ;"
1126+ "movq %%r11, 32(%0) ;"
1127+ "adcq %%rcx, %%r15 ;"
1128+ "movq %%r15, 40(%0) ;"
1129+ "movq 24(%1), %%rdx ;"
1130+ "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
1131+ "adcq %%rax, %%r13 ;"
1132+ "movq %%r13, 48(%0) ;"
1133+ "adcq %%rcx, %%r14 ;"
1134+ "movq %%r14, 56(%0) ;"
1135+
1136+ "movq 40(%1), %%rdx ;" /* B[1] */
1137+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
1138+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
1139+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
1140+
1141+ "movq 48(%1), %%rdx ;" /* B[2] */
1142+ "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
1143+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
1144+
1145+ "addq %%rax, %%r9 ;"
1146+ "adcq %%rdx, %%r10 ;"
1147+ "adcq %%rcx, %%r11 ;"
1148+ "adcq %%r14, %%r15 ;"
1149+ "adcq $0, %%r13 ;"
1150+ "movq $0, %%r14 ;"
1151+ "adcq $0, %%r14 ;"
1152+
1153+ "movq 32(%1), %%rdx ;" /* B[0] */
1154+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
1155+
1156+ "addq %%rax, %%r10 ;"
1157+ "adcq %%rcx, %%r11 ;"
1158+ "adcq $0, %%r15 ;"
1159+ "adcq $0, %%r13 ;"
1160+ "adcq $0, %%r14 ;"
1161+
1162+ "shldq $1, %%r13, %%r14 ;"
1163+ "shldq $1, %%r15, %%r13 ;"
1164+ "shldq $1, %%r11, %%r15 ;"
1165+ "shldq $1, %%r10, %%r11 ;"
1166+ "shldq $1, %%r9, %%r10 ;"
1167+ "shldq $1, %%r8, %%r9 ;"
1168+ "shlq $1, %%r8 ;"
1169+
1170+ /*******************/
1171+ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
1172+ /*******************/
1173+ "movq %%rax, 64(%0) ;"
1174+ "addq %%rcx, %%r8 ;"
1175+ "movq %%r8, 72(%0) ;"
1176+ "movq 40(%1), %%rdx ;"
1177+ "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
1178+ "adcq %%rax, %%r9 ;"
1179+ "movq %%r9, 80(%0) ;"
1180+ "adcq %%rcx, %%r10 ;"
1181+ "movq %%r10, 88(%0) ;"
1182+ "movq 48(%1), %%rdx ;"
1183+ "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
1184+ "adcq %%rax, %%r11 ;"
1185+ "movq %%r11, 96(%0) ;"
1186+ "adcq %%rcx, %%r15 ;"
1187+ "movq %%r15, 104(%0) ;"
1188+ "movq 56(%1), %%rdx ;"
1189+ "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
1190+ "adcq %%rax, %%r13 ;"
1191+ "movq %%r13, 112(%0) ;"
1192+ "adcq %%rcx, %%r14 ;"
1193+ "movq %%r14, 120(%0) ;"
1194+ :
1195+ : "r"(c), "r"(a)
1196+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1197+ "%r11", "%r13", "%r14", "%r15");
1198+}
1199+
1200+static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
1201+{
1202+ asm volatile(
1203+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
1204+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
1205+ "xorl %%ebx, %%ebx ;"
1206+ "adox (%1), %%r8 ;"
1207+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
1208+ "adcx %%r10, %%r9 ;"
1209+ "adox 8(%1), %%r9 ;"
1210+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
1211+ "adcx %%r11, %%r10 ;"
1212+ "adox 16(%1), %%r10 ;"
1213+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
1214+ "adcx %%rax, %%r11 ;"
1215+ "adox 24(%1), %%r11 ;"
1216+ /***************************************/
1217+ "adcx %%rbx, %%rcx ;"
1218+ "adox %%rbx, %%rcx ;"
1219+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1220+ "adcx %%rcx, %%r8 ;"
1221+ "adcx %%rbx, %%r9 ;"
1222+ "movq %%r9, 8(%0) ;"
1223+ "adcx %%rbx, %%r10 ;"
1224+ "movq %%r10, 16(%0) ;"
1225+ "adcx %%rbx, %%r11 ;"
1226+ "movq %%r11, 24(%0) ;"
1227+ "mov $0, %%ecx ;"
1228+ "cmovc %%edx, %%ecx ;"
1229+ "addq %%rcx, %%r8 ;"
1230+ "movq %%r8, (%0) ;"
1231+
1232+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
1233+ "xorl %%ebx, %%ebx ;"
1234+ "adox 64(%1), %%r8 ;"
1235+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
1236+ "adcx %%r10, %%r9 ;"
1237+ "adox 72(%1), %%r9 ;"
1238+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
1239+ "adcx %%r11, %%r10 ;"
1240+ "adox 80(%1), %%r10 ;"
1241+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
1242+ "adcx %%rax, %%r11 ;"
1243+ "adox 88(%1), %%r11 ;"
1244+ /****************************************/
1245+ "adcx %%rbx, %%rcx ;"
1246+ "adox %%rbx, %%rcx ;"
1247+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1248+ "adcx %%rcx, %%r8 ;"
1249+ "adcx %%rbx, %%r9 ;"
1250+ "movq %%r9, 40(%0) ;"
1251+ "adcx %%rbx, %%r10 ;"
1252+ "movq %%r10, 48(%0) ;"
1253+ "adcx %%rbx, %%r11 ;"
1254+ "movq %%r11, 56(%0) ;"
1255+ "mov $0, %%ecx ;"
1256+ "cmovc %%edx, %%ecx ;"
1257+ "addq %%rcx, %%r8 ;"
1258+ "movq %%r8, 32(%0) ;"
1259+ :
1260+ : "r"(c), "r"(a)
1261+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1262+ "%r10", "%r11");
1263+}
1264+
1265+static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
1266+{
1267+ asm volatile(
1268+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
1269+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1270+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1271+ "addq %%r10, %%r9 ;"
1272+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1273+ "adcq %%r11, %%r10 ;"
1274+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1275+ "adcq %%rax, %%r11 ;"
1276+ /***************************************/
1277+ "adcq $0, %%rcx ;"
1278+ "addq (%1), %%r8 ;"
1279+ "adcq 8(%1), %%r9 ;"
1280+ "adcq 16(%1), %%r10 ;"
1281+ "adcq 24(%1), %%r11 ;"
1282+ "adcq $0, %%rcx ;"
1283+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1284+ "addq %%rcx, %%r8 ;"
1285+ "adcq $0, %%r9 ;"
1286+ "movq %%r9, 8(%0) ;"
1287+ "adcq $0, %%r10 ;"
1288+ "movq %%r10, 16(%0) ;"
1289+ "adcq $0, %%r11 ;"
1290+ "movq %%r11, 24(%0) ;"
1291+ "mov $0, %%ecx ;"
1292+ "cmovc %%edx, %%ecx ;"
1293+ "addq %%rcx, %%r8 ;"
1294+ "movq %%r8, (%0) ;"
1295+
1296+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
1297+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
1298+ "addq %%r10, %%r9 ;"
1299+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
1300+ "adcq %%r11, %%r10 ;"
1301+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
1302+ "adcq %%rax, %%r11 ;"
1303+ /****************************************/
1304+ "adcq $0, %%rcx ;"
1305+ "addq 64(%1), %%r8 ;"
1306+ "adcq 72(%1), %%r9 ;"
1307+ "adcq 80(%1), %%r10 ;"
1308+ "adcq 88(%1), %%r11 ;"
1309+ "adcq $0, %%rcx ;"
1310+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1311+ "addq %%rcx, %%r8 ;"
1312+ "adcq $0, %%r9 ;"
1313+ "movq %%r9, 40(%0) ;"
1314+ "adcq $0, %%r10 ;"
1315+ "movq %%r10, 48(%0) ;"
1316+ "adcq $0, %%r11 ;"
1317+ "movq %%r11, 56(%0) ;"
1318+ "mov $0, %%ecx ;"
1319+ "cmovc %%edx, %%ecx ;"
1320+ "addq %%rcx, %%r8 ;"
1321+ "movq %%r8, 32(%0) ;"
1322+ :
1323+ : "r"(c), "r"(a)
1324+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1325+ "%r11");
1326+}
1327+
1328+static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
1329+ const u64 *const b)
1330+{
1331+ asm volatile(
1332+ "movq (%1), %%rdx; " /* A[0] */
1333+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
1334+ "xorl %%r10d, %%r10d ;"
1335+ "movq %%r8, (%0) ;"
1336+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
1337+ "adox %%r9, %%r10 ;"
1338+ "movq %%r10, 8(%0) ;"
1339+ "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
1340+ "adox %%r11, %%r15 ;"
1341+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
1342+ "adox %%r13, %%r14 ;"
1343+ "movq $0, %%rax ;"
1344+ /******************************************/
1345+ "adox %%rdx, %%rax ;"
1346+
1347+ "movq 8(%1), %%rdx; " /* A[1] */
1348+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1349+ "xorl %%r10d, %%r10d ;"
1350+ "adcx 8(%0), %%r8 ;"
1351+ "movq %%r8, 8(%0) ;"
1352+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1353+ "adox %%r9, %%r10 ;"
1354+ "adcx %%r15, %%r10 ;"
1355+ "movq %%r10, 16(%0) ;"
1356+ "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
1357+ "adox %%r11, %%r15 ;"
1358+ "adcx %%r14, %%r15 ;"
1359+ "movq $0, %%r8 ;"
1360+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
1361+ "adox %%r13, %%r14 ;"
1362+ "adcx %%rax, %%r14 ;"
1363+ "movq $0, %%rax ;"
1364+ /******************************************/
1365+ "adox %%rdx, %%rax ;"
1366+ "adcx %%r8, %%rax ;"
1367+
1368+ "movq 16(%1), %%rdx; " /* A[2] */
1369+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1370+ "xorl %%r10d, %%r10d ;"
1371+ "adcx 16(%0), %%r8 ;"
1372+ "movq %%r8, 16(%0) ;"
1373+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1374+ "adox %%r9, %%r10 ;"
1375+ "adcx %%r15, %%r10 ;"
1376+ "movq %%r10, 24(%0) ;"
1377+ "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
1378+ "adox %%r11, %%r15 ;"
1379+ "adcx %%r14, %%r15 ;"
1380+ "movq $0, %%r8 ;"
1381+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
1382+ "adox %%r13, %%r14 ;"
1383+ "adcx %%rax, %%r14 ;"
1384+ "movq $0, %%rax ;"
1385+ /******************************************/
1386+ "adox %%rdx, %%rax ;"
1387+ "adcx %%r8, %%rax ;"
1388+
1389+ "movq 24(%1), %%rdx; " /* A[3] */
1390+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1391+ "xorl %%r10d, %%r10d ;"
1392+ "adcx 24(%0), %%r8 ;"
1393+ "movq %%r8, 24(%0) ;"
1394+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1395+ "adox %%r9, %%r10 ;"
1396+ "adcx %%r15, %%r10 ;"
1397+ "movq %%r10, 32(%0) ;"
1398+ "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
1399+ "adox %%r11, %%r15 ;"
1400+ "adcx %%r14, %%r15 ;"
1401+ "movq %%r15, 40(%0) ;"
1402+ "movq $0, %%r8 ;"
1403+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
1404+ "adox %%r13, %%r14 ;"
1405+ "adcx %%rax, %%r14 ;"
1406+ "movq %%r14, 48(%0) ;"
1407+ "movq $0, %%rax ;"
1408+ /******************************************/
1409+ "adox %%rdx, %%rax ;"
1410+ "adcx %%r8, %%rax ;"
1411+ "movq %%rax, 56(%0) ;"
1412+ :
1413+ : "r"(c), "r"(a), "r"(b)
1414+ : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
1415+ "%r13", "%r14", "%r15");
1416+}
1417+
1418+static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
1419+ const u64 *const b)
1420+{
1421+ asm volatile(
1422+ "movq (%1), %%rdx; " /* A[0] */
1423+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
1424+ "movq %%r8, (%0) ;"
1425+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
1426+ "addq %%r10, %%r15 ;"
1427+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
1428+ "adcq %%r8, %%rax ;"
1429+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
1430+ "adcq %%r10, %%rbx ;"
1431+ /******************************************/
1432+ "adcq $0, %%rcx ;"
1433+
1434+ "movq 8(%1), %%rdx; " /* A[1] */
1435+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
1436+ "addq %%r15, %%r8 ;"
1437+ "movq %%r8, 8(%0) ;"
1438+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1439+ "adcq %%r10, %%r9 ;"
1440+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
1441+ "adcq %%r8, %%r11 ;"
1442+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
1443+ "adcq %%r10, %%r13 ;"
1444+ /******************************************/
1445+ "adcq $0, %%r15 ;"
1446+
1447+ "addq %%r9, %%rax ;"
1448+ "adcq %%r11, %%rbx ;"
1449+ "adcq %%r13, %%rcx ;"
1450+ "adcq $0, %%r15 ;"
1451+
1452+ "movq 16(%1), %%rdx; " /* A[2] */
1453+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
1454+ "addq %%rax, %%r8 ;"
1455+ "movq %%r8, 16(%0) ;"
1456+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1457+ "adcq %%r10, %%r9 ;"
1458+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
1459+ "adcq %%r8, %%r11 ;"
1460+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
1461+ "adcq %%r10, %%r13 ;"
1462+ /******************************************/
1463+ "adcq $0, %%rax ;"
1464+
1465+ "addq %%r9, %%rbx ;"
1466+ "adcq %%r11, %%rcx ;"
1467+ "adcq %%r13, %%r15 ;"
1468+ "adcq $0, %%rax ;"
1469+
1470+ "movq 24(%1), %%rdx; " /* A[3] */
1471+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
1472+ "addq %%rbx, %%r8 ;"
1473+ "movq %%r8, 24(%0) ;"
1474+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1475+ "adcq %%r10, %%r9 ;"
1476+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
1477+ "adcq %%r8, %%r11 ;"
1478+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
1479+ "adcq %%r10, %%r13 ;"
1480+ /******************************************/
1481+ "adcq $0, %%rbx ;"
1482+
1483+ "addq %%r9, %%rcx ;"
1484+ "movq %%rcx, 32(%0) ;"
1485+ "adcq %%r11, %%r15 ;"
1486+ "movq %%r15, 40(%0) ;"
1487+ "adcq %%r13, %%rax ;"
1488+ "movq %%rax, 48(%0) ;"
1489+ "adcq $0, %%rbx ;"
1490+ "movq %%rbx, 56(%0) ;"
1491+ :
1492+ : "r"(c), "r"(a), "r"(b)
1493+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1494+ "%r10", "%r11", "%r13", "%r15");
1495+}
1496+
1497+static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
1498+{
1499+ asm volatile(
1500+ "movq (%1), %%rdx ;" /* A[0] */
1501+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
1502+ "xorl %%r15d, %%r15d;"
1503+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
1504+ "adcx %%r14, %%r9 ;"
1505+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1506+ "adcx %%rax, %%r10 ;"
1507+ "movq 24(%1), %%rdx ;" /* A[3] */
1508+ "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1509+ "adcx %%rcx, %%r11 ;"
1510+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1511+ "adcx %%rax, %%rbx ;"
1512+ "movq 8(%1), %%rdx ;" /* A[1] */
1513+ "adcx %%r15, %%r13 ;"
1514+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1515+ "movq $0, %%r14 ;"
1516+ /******************************************/
1517+ "adcx %%r15, %%r14 ;"
1518+
1519+ "xorl %%r15d, %%r15d;"
1520+ "adox %%rax, %%r10 ;"
1521+ "adcx %%r8, %%r8 ;"
1522+ "adox %%rcx, %%r11 ;"
1523+ "adcx %%r9, %%r9 ;"
1524+ "adox %%r15, %%rbx ;"
1525+ "adcx %%r10, %%r10 ;"
1526+ "adox %%r15, %%r13 ;"
1527+ "adcx %%r11, %%r11 ;"
1528+ "adox %%r15, %%r14 ;"
1529+ "adcx %%rbx, %%rbx ;"
1530+ "adcx %%r13, %%r13 ;"
1531+ "adcx %%r14, %%r14 ;"
1532+
1533+ "movq (%1), %%rdx ;"
1534+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1535+ /*******************/
1536+ "movq %%rax, 0(%0) ;"
1537+ "addq %%rcx, %%r8 ;"
1538+ "movq %%r8, 8(%0) ;"
1539+ "movq 8(%1), %%rdx ;"
1540+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1541+ "adcq %%rax, %%r9 ;"
1542+ "movq %%r9, 16(%0) ;"
1543+ "adcq %%rcx, %%r10 ;"
1544+ "movq %%r10, 24(%0) ;"
1545+ "movq 16(%1), %%rdx ;"
1546+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1547+ "adcq %%rax, %%r11 ;"
1548+ "movq %%r11, 32(%0) ;"
1549+ "adcq %%rcx, %%rbx ;"
1550+ "movq %%rbx, 40(%0) ;"
1551+ "movq 24(%1), %%rdx ;"
1552+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1553+ "adcq %%rax, %%r13 ;"
1554+ "movq %%r13, 48(%0) ;"
1555+ "adcq %%rcx, %%r14 ;"
1556+ "movq %%r14, 56(%0) ;"
1557+ :
1558+ : "r"(c), "r"(a)
1559+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1560+ "%r10", "%r11", "%r13", "%r14", "%r15");
1561+}
1562+
1563+static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1564+{
1565+ asm volatile(
1566+ "movq 8(%1), %%rdx ;" /* A[1] */
1567+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
1568+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1569+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1570+
1571+ "movq 16(%1), %%rdx ;" /* A[2] */
1572+ "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1573+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1574+
1575+ "addq %%rax, %%r9 ;"
1576+ "adcq %%rdx, %%r10 ;"
1577+ "adcq %%rcx, %%r11 ;"
1578+ "adcq %%r14, %%r15 ;"
1579+ "adcq $0, %%r13 ;"
1580+ "movq $0, %%r14 ;"
1581+ "adcq $0, %%r14 ;"
1582+
1583+ "movq (%1), %%rdx ;" /* A[0] */
1584+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1585+
1586+ "addq %%rax, %%r10 ;"
1587+ "adcq %%rcx, %%r11 ;"
1588+ "adcq $0, %%r15 ;"
1589+ "adcq $0, %%r13 ;"
1590+ "adcq $0, %%r14 ;"
1591+
1592+ "shldq $1, %%r13, %%r14 ;"
1593+ "shldq $1, %%r15, %%r13 ;"
1594+ "shldq $1, %%r11, %%r15 ;"
1595+ "shldq $1, %%r10, %%r11 ;"
1596+ "shldq $1, %%r9, %%r10 ;"
1597+ "shldq $1, %%r8, %%r9 ;"
1598+ "shlq $1, %%r8 ;"
1599+
1600+ /*******************/
1601+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1602+ /*******************/
1603+ "movq %%rax, 0(%0) ;"
1604+ "addq %%rcx, %%r8 ;"
1605+ "movq %%r8, 8(%0) ;"
1606+ "movq 8(%1), %%rdx ;"
1607+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1608+ "adcq %%rax, %%r9 ;"
1609+ "movq %%r9, 16(%0) ;"
1610+ "adcq %%rcx, %%r10 ;"
1611+ "movq %%r10, 24(%0) ;"
1612+ "movq 16(%1), %%rdx ;"
1613+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1614+ "adcq %%rax, %%r11 ;"
1615+ "movq %%r11, 32(%0) ;"
1616+ "adcq %%rcx, %%r15 ;"
1617+ "movq %%r15, 40(%0) ;"
1618+ "movq 24(%1), %%rdx ;"
1619+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1620+ "adcq %%rax, %%r13 ;"
1621+ "movq %%r13, 48(%0) ;"
1622+ "adcq %%rcx, %%r14 ;"
1623+ "movq %%r14, 56(%0) ;"
1624+ :
1625+ : "r"(c), "r"(a)
1626+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1627+ "%r11", "%r13", "%r14", "%r15");
1628+}
1629+
1630+static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1631+{
1632+ asm volatile(
1633+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1634+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1635+ "xorl %%ebx, %%ebx ;"
1636+ "adox (%1), %%r8 ;"
1637+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1638+ "adcx %%r10, %%r9 ;"
1639+ "adox 8(%1), %%r9 ;"
1640+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1641+ "adcx %%r11, %%r10 ;"
1642+ "adox 16(%1), %%r10 ;"
1643+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1644+ "adcx %%rax, %%r11 ;"
1645+ "adox 24(%1), %%r11 ;"
1646+ /***************************************/
1647+ "adcx %%rbx, %%rcx ;"
1648+ "adox %%rbx, %%rcx ;"
1649+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1650+ "adcx %%rcx, %%r8 ;"
1651+ "adcx %%rbx, %%r9 ;"
1652+ "movq %%r9, 8(%0) ;"
1653+ "adcx %%rbx, %%r10 ;"
1654+ "movq %%r10, 16(%0) ;"
1655+ "adcx %%rbx, %%r11 ;"
1656+ "movq %%r11, 24(%0) ;"
1657+ "mov $0, %%ecx ;"
1658+ "cmovc %%edx, %%ecx ;"
1659+ "addq %%rcx, %%r8 ;"
1660+ "movq %%r8, (%0) ;"
1661+ :
1662+ : "r"(c), "r"(a)
1663+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1664+ "%r10", "%r11");
1665+}
1666+
1667+static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1668+{
1669+ asm volatile(
1670+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
1671+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
1672+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
1673+ "addq %%r10, %%r9 ;"
1674+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1675+ "adcq %%r11, %%r10 ;"
1676+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1677+ "adcq %%rax, %%r11 ;"
1678+ /***************************************/
1679+ "adcq $0, %%rcx ;"
1680+ "addq (%1), %%r8 ;"
1681+ "adcq 8(%1), %%r9 ;"
1682+ "adcq 16(%1), %%r10 ;"
1683+ "adcq 24(%1), %%r11 ;"
1684+ "adcq $0, %%rcx ;"
1685+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1686+ "addq %%rcx, %%r8 ;"
1687+ "adcq $0, %%r9 ;"
1688+ "movq %%r9, 8(%0) ;"
1689+ "adcq $0, %%r10 ;"
1690+ "movq %%r10, 16(%0) ;"
1691+ "adcq $0, %%r11 ;"
1692+ "movq %%r11, 24(%0) ;"
1693+ "mov $0, %%ecx ;"
1694+ "cmovc %%edx, %%ecx ;"
1695+ "addq %%rcx, %%r8 ;"
1696+ "movq %%r8, (%0) ;"
1697+ :
1698+ : "r"(c), "r"(a)
1699+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1700+ "%r11");
1701+}
1702+
1703+static __always_inline void
1704+add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
1705+{
1706+ asm volatile(
1707+ "mov $38, %%eax ;"
1708+ "xorl %%ecx, %%ecx ;"
1709+ "movq (%2), %%r8 ;"
1710+ "adcx (%1), %%r8 ;"
1711+ "movq 8(%2), %%r9 ;"
1712+ "adcx 8(%1), %%r9 ;"
1713+ "movq 16(%2), %%r10 ;"
1714+ "adcx 16(%1), %%r10 ;"
1715+ "movq 24(%2), %%r11 ;"
1716+ "adcx 24(%1), %%r11 ;"
1717+ "cmovc %%eax, %%ecx ;"
1718+ "xorl %%eax, %%eax ;"
1719+ "adcx %%rcx, %%r8 ;"
1720+ "adcx %%rax, %%r9 ;"
1721+ "movq %%r9, 8(%0) ;"
1722+ "adcx %%rax, %%r10 ;"
1723+ "movq %%r10, 16(%0) ;"
1724+ "adcx %%rax, %%r11 ;"
1725+ "movq %%r11, 24(%0) ;"
1726+ "mov $38, %%ecx ;"
1727+ "cmovc %%ecx, %%eax ;"
1728+ "addq %%rax, %%r8 ;"
1729+ "movq %%r8, (%0) ;"
1730+ :
1731+ : "r"(c), "r"(a), "r"(b)
1732+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1733+}
1734+
1735+static __always_inline void
1736+add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
1737+{
1738+ asm volatile(
1739+ "mov $38, %%eax ;"
1740+ "movq (%2), %%r8 ;"
1741+ "addq (%1), %%r8 ;"
1742+ "movq 8(%2), %%r9 ;"
1743+ "adcq 8(%1), %%r9 ;"
1744+ "movq 16(%2), %%r10 ;"
1745+ "adcq 16(%1), %%r10 ;"
1746+ "movq 24(%2), %%r11 ;"
1747+ "adcq 24(%1), %%r11 ;"
1748+ "mov $0, %%ecx ;"
1749+ "cmovc %%eax, %%ecx ;"
1750+ "addq %%rcx, %%r8 ;"
1751+ "adcq $0, %%r9 ;"
1752+ "movq %%r9, 8(%0) ;"
1753+ "adcq $0, %%r10 ;"
1754+ "movq %%r10, 16(%0) ;"
1755+ "adcq $0, %%r11 ;"
1756+ "movq %%r11, 24(%0) ;"
1757+ "mov $0, %%ecx ;"
1758+ "cmovc %%eax, %%ecx ;"
1759+ "addq %%rcx, %%r8 ;"
1760+ "movq %%r8, (%0) ;"
1761+ :
1762+ : "r"(c), "r"(a), "r"(b)
1763+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1764+}
1765+
1766+static __always_inline void
1767+sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
1768+{
1769+ asm volatile(
1770+ "mov $38, %%eax ;"
1771+ "movq (%1), %%r8 ;"
1772+ "subq (%2), %%r8 ;"
1773+ "movq 8(%1), %%r9 ;"
1774+ "sbbq 8(%2), %%r9 ;"
1775+ "movq 16(%1), %%r10 ;"
1776+ "sbbq 16(%2), %%r10 ;"
1777+ "movq 24(%1), %%r11 ;"
1778+ "sbbq 24(%2), %%r11 ;"
1779+ "mov $0, %%ecx ;"
1780+ "cmovc %%eax, %%ecx ;"
1781+ "subq %%rcx, %%r8 ;"
1782+ "sbbq $0, %%r9 ;"
1783+ "movq %%r9, 8(%0) ;"
1784+ "sbbq $0, %%r10 ;"
1785+ "movq %%r10, 16(%0) ;"
1786+ "sbbq $0, %%r11 ;"
1787+ "movq %%r11, 24(%0) ;"
1788+ "mov $0, %%ecx ;"
1789+ "cmovc %%eax, %%ecx ;"
1790+ "subq %%rcx, %%r8 ;"
1791+ "movq %%r8, (%0) ;"
1792+ :
1793+ : "r"(c), "r"(a), "r"(b)
1794+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1795+}
1796+
1797+/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
1798+static __always_inline void
1799+mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
1800+{
1801+ const u64 a24 = 121666;
1802+ asm volatile(
1803+ "movq %2, %%rdx ;"
1804+ "mulx (%1), %%r8, %%r10 ;"
1805+ "mulx 8(%1), %%r9, %%r11 ;"
1806+ "addq %%r10, %%r9 ;"
1807+ "mulx 16(%1), %%r10, %%rax ;"
1808+ "adcq %%r11, %%r10 ;"
1809+ "mulx 24(%1), %%r11, %%rcx ;"
1810+ "adcq %%rax, %%r11 ;"
1811+ /**************************/
1812+ "adcq $0, %%rcx ;"
1813+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
1814+ "imul %%rdx, %%rcx ;"
1815+ "addq %%rcx, %%r8 ;"
1816+ "adcq $0, %%r9 ;"
1817+ "movq %%r9, 8(%0) ;"
1818+ "adcq $0, %%r10 ;"
1819+ "movq %%r10, 16(%0) ;"
1820+ "adcq $0, %%r11 ;"
1821+ "movq %%r11, 24(%0) ;"
1822+ "mov $0, %%ecx ;"
1823+ "cmovc %%edx, %%ecx ;"
1824+ "addq %%rcx, %%r8 ;"
1825+ "movq %%r8, (%0) ;"
1826+ :
1827+ : "r"(c), "r"(a), "r"(a24)
1828+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1829+ "%r11");
1830+}
1831+
1832+static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1833+{
1834+ struct {
1835+ eltfp25519_1w_buffer buffer;
1836+ eltfp25519_1w x0, x1, x2;
1837+ } __aligned(32) m;
1838+ u64 *T[4];
1839+
1840+ T[0] = m.x0;
1841+ T[1] = c; /* x^(-1) */
1842+ T[2] = m.x1;
1843+ T[3] = m.x2;
1844+
1845+ copy_eltfp25519_1w(T[1], a);
1846+ sqrn_eltfp25519_1w_adx(T[1], 1);
1847+ copy_eltfp25519_1w(T[2], T[1]);
1848+ sqrn_eltfp25519_1w_adx(T[2], 2);
1849+ mul_eltfp25519_1w_adx(T[0], a, T[2]);
1850+ mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
1851+ copy_eltfp25519_1w(T[2], T[1]);
1852+ sqrn_eltfp25519_1w_adx(T[2], 1);
1853+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1854+ copy_eltfp25519_1w(T[2], T[0]);
1855+ sqrn_eltfp25519_1w_adx(T[2], 5);
1856+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1857+ copy_eltfp25519_1w(T[2], T[0]);
1858+ sqrn_eltfp25519_1w_adx(T[2], 10);
1859+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1860+ copy_eltfp25519_1w(T[3], T[2]);
1861+ sqrn_eltfp25519_1w_adx(T[3], 20);
1862+ mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
1863+ sqrn_eltfp25519_1w_adx(T[3], 10);
1864+ mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
1865+ copy_eltfp25519_1w(T[0], T[3]);
1866+ sqrn_eltfp25519_1w_adx(T[0], 50);
1867+ mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
1868+ copy_eltfp25519_1w(T[2], T[0]);
1869+ sqrn_eltfp25519_1w_adx(T[2], 100);
1870+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1871+ sqrn_eltfp25519_1w_adx(T[2], 50);
1872+ mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
1873+ sqrn_eltfp25519_1w_adx(T[2], 5);
1874+ mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
1875+
1876+ memzero_explicit(&m, sizeof(m));
1877+}
1878+
1879+static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1880+{
1881+ struct {
1882+ eltfp25519_1w_buffer buffer;
1883+ eltfp25519_1w x0, x1, x2;
1884+ } __aligned(32) m;
1885+ u64 *T[5];
1886+
1887+ T[0] = m.x0;
1888+ T[1] = c; /* x^(-1) */
1889+ T[2] = m.x1;
1890+ T[3] = m.x2;
1891+
1892+ copy_eltfp25519_1w(T[1], a);
1893+ sqrn_eltfp25519_1w_bmi2(T[1], 1);
1894+ copy_eltfp25519_1w(T[2], T[1]);
1895+ sqrn_eltfp25519_1w_bmi2(T[2], 2);
1896+ mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
1897+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
1898+ copy_eltfp25519_1w(T[2], T[1]);
1899+ sqrn_eltfp25519_1w_bmi2(T[2], 1);
1900+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1901+ copy_eltfp25519_1w(T[2], T[0]);
1902+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
1903+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1904+ copy_eltfp25519_1w(T[2], T[0]);
1905+ sqrn_eltfp25519_1w_bmi2(T[2], 10);
1906+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1907+ copy_eltfp25519_1w(T[3], T[2]);
1908+ sqrn_eltfp25519_1w_bmi2(T[3], 20);
1909+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
1910+ sqrn_eltfp25519_1w_bmi2(T[3], 10);
1911+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
1912+ copy_eltfp25519_1w(T[0], T[3]);
1913+ sqrn_eltfp25519_1w_bmi2(T[0], 50);
1914+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
1915+ copy_eltfp25519_1w(T[2], T[0]);
1916+ sqrn_eltfp25519_1w_bmi2(T[2], 100);
1917+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1918+ sqrn_eltfp25519_1w_bmi2(T[2], 50);
1919+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
1920+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
1921+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
1922+
1923+ memzero_explicit(&m, sizeof(m));
1924+}
1925+
1926+/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
1927+ * with a number such that 0 <= C < 2**255-19.
1928+ */
1929+static __always_inline void fred_eltfp25519_1w(u64 *const c)
1930+{
1931+ u64 tmp0 = 38, tmp1 = 19;
1932+ asm volatile(
1933+ "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
1934+ "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
1935+
1936+ /* Add either 19 or 38 to c */
1937+ "addq %4, %0 ;"
1938+ "adcq $0, %1 ;"
1939+ "adcq $0, %2 ;"
1940+ "adcq $0, %3 ;"
1941+
1942+ /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
1943+ "movl $0, %k4 ;"
1944+ "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
1945+ "btrq $63, %3 ;" /* Clear bit 255 */
1946+
1947+ /* Subtract 19 if necessary */
1948+ "subq %4, %0 ;"
1949+ "sbbq $0, %1 ;"
1950+ "sbbq $0, %2 ;"
1951+ "sbbq $0, %3 ;"
1952+
1953+ : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
1954+ "+r"(tmp1)
1955+ :
1956+ : "memory", "cc");
1957+}
1958+
1959+static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
1960+{
1961+ u64 temp;
1962+ asm volatile(
1963+ "test %9, %9 ;"
1964+ "movq %0, %8 ;"
1965+ "cmovnzq %4, %0 ;"
1966+ "cmovnzq %8, %4 ;"
1967+ "movq %1, %8 ;"
1968+ "cmovnzq %5, %1 ;"
1969+ "cmovnzq %8, %5 ;"
1970+ "movq %2, %8 ;"
1971+ "cmovnzq %6, %2 ;"
1972+ "cmovnzq %8, %6 ;"
1973+ "movq %3, %8 ;"
1974+ "cmovnzq %7, %3 ;"
1975+ "cmovnzq %8, %7 ;"
1976+ : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
1977+ "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
1978+ "=r"(temp)
1979+ : "r"(bit)
1980+ : "cc"
1981+ );
1982+}
1983+
1984+static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
1985+{
1986+ asm volatile(
1987+ "test %4, %4 ;"
1988+ "cmovnzq %5, %0 ;"
1989+ "cmovnzq %6, %1 ;"
1990+ "cmovnzq %7, %2 ;"
1991+ "cmovnzq %8, %3 ;"
1992+ : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
1993+ : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
1994+ : "cc"
1995+ );
1996+}
1997+
1998+static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
1999+ const u8 private_key[CURVE25519_KEY_SIZE],
2000+ const u8 session_key[CURVE25519_KEY_SIZE])
2001+{
2002+ struct {
2003+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
2004+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2005+ u64 workspace[6 * NUM_WORDS_ELTFP25519];
2006+ u8 session[CURVE25519_KEY_SIZE];
2007+ u8 private[CURVE25519_KEY_SIZE];
2008+ } __aligned(32) m;
2009+
2010+ int i = 0, j = 0;
2011+ u64 prev = 0;
2012+ u64 *const X1 = (u64 *)m.session;
2013+ u64 *const key = (u64 *)m.private;
2014+ u64 *const Px = m.coordinates + 0;
2015+ u64 *const Pz = m.coordinates + 4;
2016+ u64 *const Qx = m.coordinates + 8;
2017+ u64 *const Qz = m.coordinates + 12;
2018+ u64 *const X2 = Qx;
2019+ u64 *const Z2 = Qz;
2020+ u64 *const X3 = Px;
2021+ u64 *const Z3 = Pz;
2022+ u64 *const X2Z2 = Qx;
2023+ u64 *const X3Z3 = Px;
2024+
2025+ u64 *const A = m.workspace + 0;
2026+ u64 *const B = m.workspace + 4;
2027+ u64 *const D = m.workspace + 8;
2028+ u64 *const C = m.workspace + 12;
2029+ u64 *const DA = m.workspace + 16;
2030+ u64 *const CB = m.workspace + 20;
2031+ u64 *const AB = A;
2032+ u64 *const DC = D;
2033+ u64 *const DACB = DA;
2034+
2035+ memcpy(m.private, private_key, sizeof(m.private));
2036+ memcpy(m.session, session_key, sizeof(m.session));
2037+
2038+ curve25519_clamp_secret(m.private);
2039+
2040+ /* As in the draft:
2041+ * When receiving such an array, implementations of curve25519
2042+ * MUST mask the most-significant bit in the final byte. This
2043+ * is done to preserve compatibility with point formats which
2044+ * reserve the sign bit for use in other protocols and to
2045+ * increase resistance to implementation fingerprinting
2046+ */
2047+ m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2048+
2049+ copy_eltfp25519_1w(Px, X1);
2050+ setzero_eltfp25519_1w(Pz);
2051+ setzero_eltfp25519_1w(Qx);
2052+ setzero_eltfp25519_1w(Qz);
2053+
2054+ Pz[0] = 1;
2055+ Qx[0] = 1;
2056+
2057+ /* main-loop */
2058+ prev = 0;
2059+ j = 62;
2060+ for (i = 3; i >= 0; --i) {
2061+ while (j >= 0) {
2062+ u64 bit = (key[i] >> j) & 0x1;
2063+ u64 swap = bit ^ prev;
2064+ prev = bit;
2065+
2066+ add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
2067+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
2068+ add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
2069+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
2070+ mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
2071+
2072+ cselect(swap, A, C);
2073+ cselect(swap, B, D);
2074+
2075+ sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
2076+ add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
2077+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
2078+ sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2079+
2080+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
2081+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
2082+
2083+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
2084+ add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
2085+ mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2086+ mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
2087+ --j;
2088+ }
2089+ j = 63;
2090+ }
2091+
2092+ inv_eltfp25519_1w_adx(A, Qz);
2093+ mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
2094+ fred_eltfp25519_1w((u64 *)shared);
2095+
2096+ memzero_explicit(&m, sizeof(m));
2097+}
2098+
2099+static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
2100+ const u8 private_key[CURVE25519_KEY_SIZE])
2101+{
2102+ struct {
2103+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
2104+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2105+ u64 workspace[4 * NUM_WORDS_ELTFP25519];
2106+ u8 private[CURVE25519_KEY_SIZE];
2107+ } __aligned(32) m;
2108+
2109+ const int ite[4] = { 64, 64, 64, 63 };
2110+ const int q = 3;
2111+ u64 swap = 1;
2112+
2113+ int i = 0, j = 0, k = 0;
2114+ u64 *const key = (u64 *)m.private;
2115+ u64 *const Ur1 = m.coordinates + 0;
2116+ u64 *const Zr1 = m.coordinates + 4;
2117+ u64 *const Ur2 = m.coordinates + 8;
2118+ u64 *const Zr2 = m.coordinates + 12;
2119+
2120+ u64 *const UZr1 = m.coordinates + 0;
2121+ u64 *const ZUr2 = m.coordinates + 8;
2122+
2123+ u64 *const A = m.workspace + 0;
2124+ u64 *const B = m.workspace + 4;
2125+ u64 *const C = m.workspace + 8;
2126+ u64 *const D = m.workspace + 12;
2127+
2128+ u64 *const AB = m.workspace + 0;
2129+ u64 *const CD = m.workspace + 8;
2130+
2131+ const u64 *const P = table_ladder_8k;
2132+
2133+ memcpy(m.private, private_key, sizeof(m.private));
2134+
2135+ curve25519_clamp_secret(m.private);
2136+
2137+ setzero_eltfp25519_1w(Ur1);
2138+ setzero_eltfp25519_1w(Zr1);
2139+ setzero_eltfp25519_1w(Zr2);
2140+ Ur1[0] = 1;
2141+ Zr1[0] = 1;
2142+ Zr2[0] = 1;
2143+
2144+ /* G-S */
2145+ Ur2[3] = 0x1eaecdeee27cab34UL;
2146+ Ur2[2] = 0xadc7a0b9235d48e2UL;
2147+ Ur2[1] = 0xbbf095ae14b2edf8UL;
2148+ Ur2[0] = 0x7e94e1fec82faabdUL;
2149+
2150+ /* main-loop */
2151+ j = q;
2152+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2153+ while (j < ite[i]) {
2154+ u64 bit = (key[i] >> j) & 0x1;
2155+ k = (64 * i + j - q);
2156+ swap = swap ^ bit;
2157+ cswap(swap, Ur1, Ur2);
2158+ cswap(swap, Zr1, Zr2);
2159+ swap = bit;
2160+ /* Addition */
2161+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2162+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2163+ mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
2164+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2165+ add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2166+ sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
2167+ mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
2168+ ++j;
2169+ }
2170+ j = 0;
2171+ }
2172+
2173+ /* Doubling */
2174+ for (i = 0; i < q; ++i) {
2175+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2176+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2177+ sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
2178+ copy_eltfp25519_1w(C, B); /* C = B */
2179+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
2180+ mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
2181+ add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
2182+ mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
2183+ }
2184+
2185+ /* Convert to affine coordinates */
2186+ inv_eltfp25519_1w_adx(A, Zr1);
2187+ mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
2188+ fred_eltfp25519_1w((u64 *)session_key);
2189+
2190+ memzero_explicit(&m, sizeof(m));
2191+}
2192+
2193+static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
2194+ const u8 private_key[CURVE25519_KEY_SIZE],
2195+ const u8 session_key[CURVE25519_KEY_SIZE])
2196+{
2197+ struct {
2198+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
2199+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2200+ u64 workspace[6 * NUM_WORDS_ELTFP25519];
2201+ u8 session[CURVE25519_KEY_SIZE];
2202+ u8 private[CURVE25519_KEY_SIZE];
2203+ } __aligned(32) m;
2204+
2205+ int i = 0, j = 0;
2206+ u64 prev = 0;
2207+ u64 *const X1 = (u64 *)m.session;
2208+ u64 *const key = (u64 *)m.private;
2209+ u64 *const Px = m.coordinates + 0;
2210+ u64 *const Pz = m.coordinates + 4;
2211+ u64 *const Qx = m.coordinates + 8;
2212+ u64 *const Qz = m.coordinates + 12;
2213+ u64 *const X2 = Qx;
2214+ u64 *const Z2 = Qz;
2215+ u64 *const X3 = Px;
2216+ u64 *const Z3 = Pz;
2217+ u64 *const X2Z2 = Qx;
2218+ u64 *const X3Z3 = Px;
2219+
2220+ u64 *const A = m.workspace + 0;
2221+ u64 *const B = m.workspace + 4;
2222+ u64 *const D = m.workspace + 8;
2223+ u64 *const C = m.workspace + 12;
2224+ u64 *const DA = m.workspace + 16;
2225+ u64 *const CB = m.workspace + 20;
2226+ u64 *const AB = A;
2227+ u64 *const DC = D;
2228+ u64 *const DACB = DA;
2229+
2230+ memcpy(m.private, private_key, sizeof(m.private));
2231+ memcpy(m.session, session_key, sizeof(m.session));
2232+
2233+ curve25519_clamp_secret(m.private);
2234+
2235+ /* As in the draft:
2236+ * When receiving such an array, implementations of curve25519
2237+ * MUST mask the most-significant bit in the final byte. This
2238+ * is done to preserve compatibility with point formats which
2239+ * reserve the sign bit for use in other protocols and to
2240+ * increase resistance to implementation fingerprinting
2241+ */
2242+ m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2243+
2244+ copy_eltfp25519_1w(Px, X1);
2245+ setzero_eltfp25519_1w(Pz);
2246+ setzero_eltfp25519_1w(Qx);
2247+ setzero_eltfp25519_1w(Qz);
2248+
2249+ Pz[0] = 1;
2250+ Qx[0] = 1;
2251+
2252+ /* main-loop */
2253+ prev = 0;
2254+ j = 62;
2255+ for (i = 3; i >= 0; --i) {
2256+ while (j >= 0) {
2257+ u64 bit = (key[i] >> j) & 0x1;
2258+ u64 swap = bit ^ prev;
2259+ prev = bit;
2260+
2261+ add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
2262+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
2263+ add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
2264+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
2265+ mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
2266+
2267+ cselect(swap, A, C);
2268+ cselect(swap, B, D);
2269+
2270+ sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
2271+ add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
2272+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
2273+ sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2274+
2275+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
2276+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
2277+
2278+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
2279+ add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
2280+ mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2281+ mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
2282+ --j;
2283+ }
2284+ j = 63;
2285+ }
2286+
2287+ inv_eltfp25519_1w_bmi2(A, Qz);
2288+ mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
2289+ fred_eltfp25519_1w((u64 *)shared);
2290+
2291+ memzero_explicit(&m, sizeof(m));
2292+}
2293+
2294+static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
2295+ const u8 private_key[CURVE25519_KEY_SIZE])
2296+{
2297+ struct {
2298+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
2299+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2300+ u64 workspace[4 * NUM_WORDS_ELTFP25519];
2301+ u8 private[CURVE25519_KEY_SIZE];
2302+ } __aligned(32) m;
2303+
2304+ const int ite[4] = { 64, 64, 64, 63 };
2305+ const int q = 3;
2306+ u64 swap = 1;
2307+
2308+ int i = 0, j = 0, k = 0;
2309+ u64 *const key = (u64 *)m.private;
2310+ u64 *const Ur1 = m.coordinates + 0;
2311+ u64 *const Zr1 = m.coordinates + 4;
2312+ u64 *const Ur2 = m.coordinates + 8;
2313+ u64 *const Zr2 = m.coordinates + 12;
2314+
2315+ u64 *const UZr1 = m.coordinates + 0;
2316+ u64 *const ZUr2 = m.coordinates + 8;
2317+
2318+ u64 *const A = m.workspace + 0;
2319+ u64 *const B = m.workspace + 4;
2320+ u64 *const C = m.workspace + 8;
2321+ u64 *const D = m.workspace + 12;
2322+
2323+ u64 *const AB = m.workspace + 0;
2324+ u64 *const CD = m.workspace + 8;
2325+
2326+ const u64 *const P = table_ladder_8k;
2327+
2328+ memcpy(m.private, private_key, sizeof(m.private));
2329+
2330+ curve25519_clamp_secret(m.private);
2331+
2332+ setzero_eltfp25519_1w(Ur1);
2333+ setzero_eltfp25519_1w(Zr1);
2334+ setzero_eltfp25519_1w(Zr2);
2335+ Ur1[0] = 1;
2336+ Zr1[0] = 1;
2337+ Zr2[0] = 1;
2338+
2339+ /* G-S */
2340+ Ur2[3] = 0x1eaecdeee27cab34UL;
2341+ Ur2[2] = 0xadc7a0b9235d48e2UL;
2342+ Ur2[1] = 0xbbf095ae14b2edf8UL;
2343+ Ur2[0] = 0x7e94e1fec82faabdUL;
2344+
2345+ /* main-loop */
2346+ j = q;
2347+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2348+ while (j < ite[i]) {
2349+ u64 bit = (key[i] >> j) & 0x1;
2350+ k = (64 * i + j - q);
2351+ swap = swap ^ bit;
2352+ cswap(swap, Ur1, Ur2);
2353+ cswap(swap, Zr1, Zr2);
2354+ swap = bit;
2355+ /* Addition */
2356+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2357+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2358+ mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
2359+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2360+ add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2361+ sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
2362+ mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
2363+ ++j;
2364+ }
2365+ j = 0;
2366+ }
2367+
2368+ /* Doubling */
2369+ for (i = 0; i < q; ++i) {
2370+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
2371+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
2372+ sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
2373+ copy_eltfp25519_1w(C, B); /* C = B */
2374+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
2375+ mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
2376+ add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
2377+ mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
2378+ }
2379+
2380+ /* Convert to affine coordinates */
2381+ inv_eltfp25519_1w_bmi2(A, Zr1);
2382+ mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
2383+ fred_eltfp25519_1w((u64 *)session_key);
2384+
2385+ memzero_explicit(&m, sizeof(m));
2386+}
2387+
2388+void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
2389+ const u8 secret[CURVE25519_KEY_SIZE],
2390+ const u8 basepoint[CURVE25519_KEY_SIZE])
2391+{
2392+ if (static_branch_likely(&curve25519_use_adx))
2393+ curve25519_adx(mypublic, secret, basepoint);
2394+ else if (static_branch_likely(&curve25519_use_bmi2))
2395+ curve25519_bmi2(mypublic, secret, basepoint);
2396+ else
2397+ curve25519_generic(mypublic, secret, basepoint);
2398+}
2399+EXPORT_SYMBOL(curve25519_arch);
2400+
2401+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
2402+ const u8 secret[CURVE25519_KEY_SIZE])
2403+{
2404+ if (static_branch_likely(&curve25519_use_adx))
2405+ curve25519_adx_base(pub, secret);
2406+ else if (static_branch_likely(&curve25519_use_bmi2))
2407+ curve25519_bmi2_base(pub, secret);
2408+ else
2409+ curve25519_generic(pub, secret, curve25519_base_point);
2410+}
2411+EXPORT_SYMBOL(curve25519_base_arch);
2412+
2413+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
2414+ unsigned int len)
2415+{
2416+ u8 *secret = kpp_tfm_ctx(tfm);
2417+
2418+ if (!len)
2419+ curve25519_generate_secret(secret);
2420+ else if (len == CURVE25519_KEY_SIZE &&
2421+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
2422+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
2423+ else
2424+ return -EINVAL;
2425+ return 0;
2426+}
2427+
2428+static int curve25519_generate_public_key(struct kpp_request *req)
2429+{
2430+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2431+ const u8 *secret = kpp_tfm_ctx(tfm);
2432+ u8 buf[CURVE25519_KEY_SIZE];
2433+ int copied, nbytes;
2434+
2435+ if (req->src)
2436+ return -EINVAL;
2437+
2438+ curve25519_base_arch(buf, secret);
2439+
2440+ /* might want less than we've got */
2441+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2442+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2443+ nbytes),
2444+ buf, nbytes);
2445+ if (copied != nbytes)
2446+ return -EINVAL;
2447+ return 0;
2448+}
2449+
2450+static int curve25519_compute_shared_secret(struct kpp_request *req)
2451+{
2452+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2453+ const u8 *secret = kpp_tfm_ctx(tfm);
2454+ u8 public_key[CURVE25519_KEY_SIZE];
2455+ u8 buf[CURVE25519_KEY_SIZE];
2456+ int copied, nbytes;
2457+
2458+ if (!req->src)
2459+ return -EINVAL;
2460+
2461+ copied = sg_copy_to_buffer(req->src,
2462+ sg_nents_for_len(req->src,
2463+ CURVE25519_KEY_SIZE),
2464+ public_key, CURVE25519_KEY_SIZE);
2465+ if (copied != CURVE25519_KEY_SIZE)
2466+ return -EINVAL;
2467+
2468+ curve25519_arch(buf, secret, public_key);
2469+
2470+ /* might want less than we've got */
2471+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2472+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2473+ nbytes),
2474+ buf, nbytes);
2475+ if (copied != nbytes)
2476+ return -EINVAL;
2477+ return 0;
2478+}
2479+
2480+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
2481+{
2482+ return CURVE25519_KEY_SIZE;
2483+}
2484+
2485+static struct kpp_alg curve25519_alg = {
2486+ .base.cra_name = "curve25519",
2487+ .base.cra_driver_name = "curve25519-x86",
2488+ .base.cra_priority = 200,
2489+ .base.cra_module = THIS_MODULE,
2490+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
2491+
2492+ .set_secret = curve25519_set_secret,
2493+ .generate_public_key = curve25519_generate_public_key,
2494+ .compute_shared_secret = curve25519_compute_shared_secret,
2495+ .max_size = curve25519_max_size,
2496+};
2497+
2498+static int __init curve25519_mod_init(void)
2499+{
2500+ if (boot_cpu_has(X86_FEATURE_BMI2))
2501+ static_branch_enable(&curve25519_use_bmi2);
2502+ else if (boot_cpu_has(X86_FEATURE_ADX))
2503+ static_branch_enable(&curve25519_use_adx);
2504+ else
2505+ return 0;
2506+ return crypto_register_kpp(&curve25519_alg);
2507+}
2508+
2509+static void __exit curve25519_mod_exit(void)
2510+{
2511+ if (boot_cpu_has(X86_FEATURE_BMI2) ||
2512+ boot_cpu_has(X86_FEATURE_ADX))
2513+ crypto_unregister_kpp(&curve25519_alg);
2514+}
2515+
2516+module_init(curve25519_mod_init);
2517+module_exit(curve25519_mod_exit);
2518+
2519+MODULE_ALIAS_CRYPTO("curve25519");
2520+MODULE_ALIAS_CRYPTO("curve25519-x86");
2521+MODULE_LICENSE("GPL v2");
2522--- a/crypto/Kconfig
2523+++ b/crypto/Kconfig
2524@@ -269,6 +269,12 @@ config CRYPTO_CURVE25519
2525 select CRYPTO_KPP
2526 select CRYPTO_LIB_CURVE25519_GENERIC
2527
2528+config CRYPTO_CURVE25519_X86
2529+ tristate "x86_64 accelerated Curve25519 scalar multiplication library"
2530+ depends on X86 && 64BIT
2531+ select CRYPTO_LIB_CURVE25519_GENERIC
2532+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
2533+
2534 comment "Authenticated Encryption with Associated Data"
2535
2536 config CRYPTO_CCM