| b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame] | 1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | Date: Fri, 8 Nov 2019 13:22:36 +0100 |
| 4 | Subject: [PATCH] crypto: curve25519 - x86_64 library and KPP implementations |
| 5 | MIME-Version: 1.0 |
| 6 | Content-Type: text/plain; charset=UTF-8 |
| 7 | Content-Transfer-Encoding: 8bit |
| 8 | |
| 9 | commit bb611bdfd6be34d9f822c73305fcc83720499d38 upstream. |
| 10 | |
| 11 | This implementation is the fastest available x86_64 implementation, and |
| 12 | unlike Sandy2x, it doesn't require use of the floating point registers at |
| 13 | all. Instead it makes use of BMI2 and ADX, available on recent |
| 14 | microarchitectures. The implementation was written by Armando |
| 15 | Faz-Hernández with contributions (upstream) from Samuel Neves and me, |
| 16 | in addition to further changes in the kernel implementation from us. |
| 17 | |
| 18 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 19 | Signed-off-by: Samuel Neves <sneves@dei.uc.pt> |
| 20 | Co-developed-by: Samuel Neves <sneves@dei.uc.pt> |
| 21 | [ardb: - move to arch/x86/crypto |
| 22 | - wire into lib/crypto framework |
| 23 | - implement crypto API KPP hooks ] |
| 24 | Signed-off-by: Ard Biesheuvel <ardb@kernel.org> |
| 25 | Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> |
| 26 | Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 27 | --- |
| 28 | arch/x86/crypto/Makefile | 1 + |
| 29 | arch/x86/crypto/curve25519-x86_64.c | 2475 +++++++++++++++++++++++++++ |
| 30 | crypto/Kconfig | 6 + |
| 31 | 3 files changed, 2482 insertions(+) |
| 32 | create mode 100644 arch/x86/crypto/curve25519-x86_64.c |
| 33 | |
| 34 | --- a/arch/x86/crypto/Makefile |
| 35 | +++ b/arch/x86/crypto/Makefile |
| 36 | @@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) |
| 37 | |
| 38 | obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o |
| 39 | obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o |
| 40 | +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o |
| 41 | |
| 42 | # These modules require assembler to support AVX. |
| 43 | ifeq ($(avx_supported),yes) |
| 44 | --- /dev/null |
| 45 | +++ b/arch/x86/crypto/curve25519-x86_64.c |
| 46 | @@ -0,0 +1,2475 @@ |
| 47 | +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| 48 | +/* |
| 49 | + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. |
| 50 | + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 51 | + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. |
| 52 | + */ |
| 53 | + |
| 54 | +#include <crypto/curve25519.h> |
| 55 | +#include <crypto/internal/kpp.h> |
| 56 | + |
| 57 | +#include <linux/types.h> |
| 58 | +#include <linux/jump_label.h> |
| 59 | +#include <linux/kernel.h> |
| 60 | +#include <linux/module.h> |
| 61 | + |
| 62 | +#include <asm/cpufeature.h> |
| 63 | +#include <asm/processor.h> |
| 64 | + |
| 65 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); /* static key defined in the "false" state; NOTE(review): presumably enabled at init when BMI2 is usable -- confirm against the init code */ |
| 66 | +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); /* same, for the ADX code path; where it gets enabled is not visible here -- confirm */ |
| 67 | + |
| 68 | +enum { NUM_WORDS_ELTFP25519 = 4 }; /* number of u64 words in one element (4 x 64 = 256 bits) */ |
| 69 | +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; /* one element: 4 u64 words, 32-byte aligned */ |
| 70 | +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; /* 8 u64 words (twice the element size), 32-byte aligned */ |
| 71 | + |
| 72 | +#define mul_eltfp25519_1w_adx(c, a, b) do { /* ADX flavour: presumably c = (a * b) reduced to one element -- TODO confirm against the helpers */ \ |
| 73 | + mul_256x256_integer_adx(m.buffer, a, b); /* 'm' is not a macro parameter: a variable 'm' with a .buffer member must be in scope at the expansion site */ \ |
| 74 | + red_eltfp25519_1w_adx(c, m.buffer); /* second step takes m.buffer together with the destination c */ \ |
| 75 | +} while (0) |
| 76 | + |
| 77 | +#define mul_eltfp25519_1w_bmi2(c, a, b) do { /* BMI2 flavour: presumably c = (a * b) reduced to one element -- TODO confirm against the helpers */ \ |
| 78 | + mul_256x256_integer_bmi2(m.buffer, a, b); /* 'm' is not a macro parameter: a variable 'm' with a .buffer member must be in scope at the expansion site */ \ |
| 79 | + red_eltfp25519_1w_bmi2(c, m.buffer); /* second step takes m.buffer together with the destination c */ \ |
| 80 | +} while (0) |
| 81 | + |
| 82 | +#define sqr_eltfp25519_1w_adx(a) do { /* ADX flavour: presumably squares a in place (a is both input and destination) -- TODO confirm */ \ |
| 83 | + sqr_256x256_integer_adx(m.buffer, a); /* relies on a variable 'm' with a .buffer member being in scope at the expansion site */ \ |
| 84 | + red_eltfp25519_1w_adx(a, m.buffer); /* a is passed as the destination, so the caller's a is overwritten */ \ |
| 85 | +} while (0) |
| 86 | + |
| 87 | +#define sqr_eltfp25519_1w_bmi2(a) do { /* BMI2 flavour: presumably squares a in place (a is both input and destination) -- TODO confirm */ \ |
| 88 | + sqr_256x256_integer_bmi2(m.buffer, a); /* relies on a variable 'm' with a .buffer member being in scope at the expansion site */ \ |
| 89 | + red_eltfp25519_1w_bmi2(a, m.buffer); /* a is passed as the destination, so the caller's a is overwritten */ \ |
| 90 | +} while (0) |
| 91 | + |
| 92 | +#define mul_eltfp25519_2w_adx(c, a, b) do { /* ADX flavour of the "2w" multiply; NOTE(review): presumably handles two elements per call -- confirm operand layout */ \ |
| 93 | + mul2_256x256_integer_adx(m.buffer, a, b); /* 'm' is not a macro parameter: a variable 'm' with a .buffer member must be in scope at the expansion site */ \ |
| 94 | + red_eltfp25519_2w_adx(c, m.buffer); /* second step takes m.buffer together with the destination c */ \ |
| 95 | +} while (0) |
| 96 | + |
| 97 | +#define mul_eltfp25519_2w_bmi2(c, a, b) do { /* BMI2 flavour of the "2w" multiply; NOTE(review): presumably handles two elements per call -- confirm operand layout */ \ |
| 98 | + mul2_256x256_integer_bmi2(m.buffer, a, b); /* 'm' is not a macro parameter: a variable 'm' with a .buffer member must be in scope at the expansion site */ \ |
| 99 | + red_eltfp25519_2w_bmi2(c, m.buffer); /* second step takes m.buffer together with the destination c */ \ |
| 100 | +} while (0) |
| 101 | + |
| 102 | +#define sqr_eltfp25519_2w_adx(a) do { /* ADX flavour of the "2w" square, in place; NOTE(review): presumably handles two elements per call -- confirm */ \ |
| 103 | + sqr2_256x256_integer_adx(m.buffer, a); /* relies on a variable 'm' with a .buffer member being in scope at the expansion site */ \ |
| 104 | + red_eltfp25519_2w_adx(a, m.buffer); /* a is passed as the destination, so the caller's a is overwritten */ \ |
| 105 | +} while (0) |
| 106 | + |
| 107 | +#define sqr_eltfp25519_2w_bmi2(a) do { /* BMI2 flavour of the "2w" square, in place; NOTE(review): presumably handles two elements per call -- confirm */ \ |
| 108 | + sqr2_256x256_integer_bmi2(m.buffer, a); /* relies on a variable 'm' with a .buffer member being in scope at the expansion site */ \ |
| 109 | + red_eltfp25519_2w_bmi2(a, m.buffer); /* a is passed as the destination, so the caller's a is overwritten */ \ |
| 110 | +} while (0) |
| 111 | + |
| 112 | +#define sqrn_eltfp25519_1w_adx(a, times) do { /* applies sqr_eltfp25519_1w_adx(a) 'times' times in a row; does nothing when times <= 0 */ \ |
| 113 | + int ____counter = (times); /* 'times' is evaluated exactly once and converted to int */ \ |
| 114 | + while (____counter-- > 0) \ |
| 115 | + sqr_eltfp25519_1w_adx(a); /* expands to code that uses m.buffer, so 'm' must be in scope here as well */ \ |
| 116 | +} while (0) |
| 117 | + |
| 118 | +#define sqrn_eltfp25519_1w_bmi2(a, times) do { /* applies sqr_eltfp25519_1w_bmi2(a) 'times' times in a row; does nothing when times <= 0 */ \ |
| 119 | + int ____counter = (times); /* 'times' is evaluated exactly once and converted to int */ \ |
| 120 | + while (____counter-- > 0) \ |
| 121 | + sqr_eltfp25519_1w_bmi2(a); /* expands to code that uses m.buffer, so 'm' must be in scope here as well */ \ |
| 122 | +} while (0) |
| 123 | + |
| 124 | +#define copy_eltfp25519_1w(C, A) do { /* copies elements 0..3 of A into C; C and A are each expanded four times, so pass side-effect-free expressions */ \ |
| 125 | + (C)[0] = (A)[0]; \ |
| 126 | + (C)[1] = (A)[1]; \ |
| 127 | + (C)[2] = (A)[2]; \ |
| 128 | + (C)[3] = (A)[3]; /* index 3 is the last word: NUM_WORDS_ELTFP25519 is 4 */ \ |
| 129 | +} while (0) |
| 130 | + |
| 131 | +#define setzero_eltfp25519_1w(C) do { /* stores 0 into elements 0..3 of C; C is expanded four times, so pass a side-effect-free expression */ \ |
| 132 | + (C)[0] = 0; \ |
| 133 | + (C)[1] = 0; \ |
| 134 | + (C)[2] = 0; \ |
| 135 | + (C)[3] = 0; /* index 3 is the last word: NUM_WORDS_ELTFP25519 is 4 */ \ |
| 136 | +} while (0) |
| 137 | + |
| 138 | +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { |
| 139 | + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, |
| 140 | + 0xffffffffffffffffUL, 0x5fffffffffffffffUL, |
| 141 | + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, |
| 142 | + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, |
| 143 | + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, |
| 144 | + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, |
| 145 | + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, |
| 146 | + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, |
| 147 | + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, |
| 148 | + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, |
| 149 | + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, |
| 150 | + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, |
| 151 | + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, |
| 152 | + 0xc1c20d06231f7614UL, 0x2938218da274f972UL, |
| 153 | + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, |
| 154 | + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, |
| 155 | + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, |
| 156 | + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, |
| 157 | + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, |
| 158 | + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, |
| 159 | + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, |
| 160 | + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, |
| 161 | + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, |
| 162 | + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, |
| 163 | + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, |
| 164 | + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, |
| 165 | + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, |
| 166 | + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, |
| 167 | + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, |
| 168 | + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, |
| 169 | + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, |
| 170 | + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, |
| 171 | + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, |
| 172 | + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, |
| 173 | + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, |
| 174 | + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, |
| 175 | + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, |
| 176 | + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, |
| 177 | + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, |
| 178 | + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, |
| 179 | + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, |
| 180 | + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, |
| 181 | + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, |
| 182 | + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, |
| 183 | + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, |
| 184 | + 0x23758739f630a257UL, 0x295a407a01a78580UL, |
| 185 | + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, |
| 186 | + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, |
| 187 | + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, |
| 188 | + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, |
| 189 | + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, |
| 190 | + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, |
| 191 | + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, |
| 192 | + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, |
| 193 | + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, |
| 194 | + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, |
| 195 | + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, |
| 196 | + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, |
| 197 | + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, |
| 198 | + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, |
| 199 | + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, |
| 200 | + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, |
| 201 | + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, |
| 202 | + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, |
| 203 | + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, |
| 204 | + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, |
| 205 | + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, |
| 206 | + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, |
| 207 | + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, |
| 208 | + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, |
| 209 | + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, |
| 210 | + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, |
| 211 | + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, |
| 212 | + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, |
| 213 | + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, |
| 214 | + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, |
| 215 | + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, |
| 216 | + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, |
| 217 | + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, |
| 218 | + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, |
| 219 | + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, |
| 220 | + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, |
| 221 | + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, |
| 222 | + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, |
| 223 | + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, |
| 224 | + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, |
| 225 | + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, |
| 226 | + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, |
| 227 | + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, |
| 228 | + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, |
| 229 | + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, |
| 230 | + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, |
| 231 | + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, |
| 232 | + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, |
| 233 | + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, |
| 234 | + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, |
| 235 | + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, |
| 236 | + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, |
| 237 | + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, |
| 238 | + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, |
| 239 | + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, |
| 240 | + 0xc189218075e91436UL, 0x6d9284169b3b8484UL, |
| 241 | + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, |
| 242 | + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, |
| 243 | + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, |
| 244 | + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, |
| 245 | + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, |
| 246 | + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, |
| 247 | + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, |
| 248 | + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, |
| 249 | + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, |
| 250 | + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, |
| 251 | + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, |
| 252 | + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, |
| 253 | + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, |
| 254 | + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, |
| 255 | + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, |
| 256 | + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, |
| 257 | + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, |
| 258 | + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, |
| 259 | + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, |
| 260 | + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, |
| 261 | + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, |
| 262 | + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, |
| 263 | + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, |
| 264 | + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, |
| 265 | + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, |
| 266 | + 0x25232973322dbef4UL, 0x445dc4758c17f770UL, |
| 267 | + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, |
| 268 | + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, |
| 269 | + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, |
| 270 | + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, |
| 271 | + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, |
| 272 | + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, |
| 273 | + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, |
| 274 | + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, |
| 275 | + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, |
| 276 | + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, |
| 277 | + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, |
| 278 | + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, |
| 279 | + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, |
| 280 | + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, |
| 281 | + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, |
| 282 | + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, |
| 283 | + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, |
| 284 | + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, |
| 285 | + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, |
| 286 | + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, |
| 287 | + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, |
| 288 | + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, |
| 289 | + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, |
| 290 | + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, |
| 291 | + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, |
| 292 | + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, |
| 293 | + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, |
| 294 | + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, |
| 295 | + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, |
| 296 | + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, |
| 297 | + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, |
| 298 | + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, |
| 299 | + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, |
| 300 | + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, |
| 301 | + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, |
| 302 | + 0x894d1d855ae52359UL, 0x68e122157b743d69UL, |
| 303 | + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, |
| 304 | + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, |
| 305 | + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, |
| 306 | + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, |
| 307 | + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, |
| 308 | + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, |
| 309 | + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, |
| 310 | + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, |
| 311 | + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, |
| 312 | + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, |
| 313 | + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, |
| 314 | + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, |
| 315 | + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, |
| 316 | + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, |
| 317 | + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, |
| 318 | + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, |
| 319 | + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, |
| 320 | + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, |
| 321 | + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, |
| 322 | + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, |
| 323 | + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, |
| 324 | + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, |
| 325 | + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, |
| 326 | + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, |
| 327 | + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, |
| 328 | + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, |
| 329 | + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, |
| 330 | + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, |
| 331 | + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, |
| 332 | + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, |
| 333 | + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, |
| 334 | + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, |
| 335 | + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, |
| 336 | + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, |
| 337 | + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, |
| 338 | + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, |
| 339 | + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, |
| 340 | + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, |
| 341 | + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, |
| 342 | + 0x4a497962066e6043UL, 0x705b3aab41355b44UL, |
| 343 | + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, |
| 344 | + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, |
| 345 | + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, |
| 346 | + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, |
| 347 | + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, |
| 348 | + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, |
| 349 | + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, |
| 350 | + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, |
| 351 | + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, |
| 352 | + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, |
| 353 | + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, |
| 354 | + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, |
| 355 | + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, |
| 356 | + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, |
| 357 | + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, |
| 358 | + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, |
| 359 | + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, |
| 360 | + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, |
| 361 | + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, |
| 362 | + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, |
| 363 | + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, |
| 364 | + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, |
| 365 | + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, |
| 366 | + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, |
| 367 | + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, |
| 368 | + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, |
| 369 | + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, |
| 370 | + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, |
| 371 | + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, |
| 372 | + 0x508e862f121692fcUL, 0x3a81907fa093c291UL, |
| 373 | + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, |
| 374 | + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, |
| 375 | + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, |
| 376 | + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, |
| 377 | + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, |
| 378 | + 0xe488de11d761e352UL, 0x0e878a01a085545cUL, |
| 379 | + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, |
| 380 | + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, |
| 381 | + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, |
| 382 | + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, |
| 383 | + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, |
| 384 | + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, |
| 385 | + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, |
| 386 | + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, |
| 387 | + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, |
| 388 | + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, |
| 389 | + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, |
| 390 | + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, |
| 391 | + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, |
| 392 | + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, |
| 393 | + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, |
| 394 | + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, |
| 395 | + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, |
| 396 | + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, |
| 397 | + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, |
| 398 | + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, |
| 399 | + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, |
| 400 | + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, |
| 401 | + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, |
| 402 | + 0x266fd5809208f294UL, 0x5c847085619a26b9UL, |
| 403 | + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, |
| 404 | + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, |
| 405 | + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, |
| 406 | + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, |
| 407 | + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, |
| 408 | + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, |
| 409 | + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, |
| 410 | + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, |
| 411 | + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, |
| 412 | + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, |
| 413 | + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, |
| 414 | + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, |
| 415 | + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, |
| 416 | + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, |
| 417 | + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, |
| 418 | + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, |
| 419 | + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, |
| 420 | + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, |
| 421 | + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, |
| 422 | + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, |
| 423 | + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, |
| 424 | + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, |
| 425 | + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, |
| 426 | + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, |
| 427 | + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, |
| 428 | + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, |
| 429 | + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, |
| 430 | + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, |
| 431 | + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, |
| 432 | + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, |
| 433 | + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, |
| 434 | + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, |
| 435 | + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, |
| 436 | + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, |
| 437 | + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, |
| 438 | + 0x52d17436309d4253UL, 0x356f97e13efae576UL, |
| 439 | + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, |
| 440 | + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, |
| 441 | + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, |
| 442 | + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, |
| 443 | + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, |
| 444 | + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, |
| 445 | + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, |
| 446 | + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, |
| 447 | + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, |
| 448 | + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, |
| 449 | + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, |
| 450 | + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, |
| 451 | + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, |
| 452 | + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, |
| 453 | + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, |
| 454 | + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, |
| 455 | + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, |
| 456 | + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, |
| 457 | + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, |
| 458 | + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, |
| 459 | + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, |
| 460 | + 0x497d723f802e88e1UL, 0x30684dea602f408dUL, |
| 461 | + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, |
| 462 | + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, |
| 463 | + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, |
| 464 | + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, |
| 465 | + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, |
| 466 | + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, |
| 467 | + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, |
| 468 | + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, |
| 469 | + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, |
| 470 | + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, |
| 471 | + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, |
| 472 | + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, |
| 473 | + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, |
| 474 | + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, |
| 475 | + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, |
| 476 | + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, |
| 477 | + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, |
| 478 | + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, |
| 479 | + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, |
| 480 | + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, |
| 481 | + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, |
| 482 | + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, |
| 483 | + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, |
| 484 | + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, |
| 485 | + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, |
| 486 | + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, |
| 487 | + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, |
| 488 | + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, |
| 489 | + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, |
| 490 | + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, |
| 491 | + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, |
| 492 | + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, |
| 493 | + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, |
| 494 | + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, |
| 495 | + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, |
| 496 | + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, |
| 497 | + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, |
| 498 | + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, |
| 499 | + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, |
| 500 | + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, |
| 501 | + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, |
| 502 | + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, |
| 503 | + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, |
| 504 | + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, |
| 505 | + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, |
| 506 | + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, |
| 507 | + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, |
| 508 | + 0x81004b71e33cc191UL, 0x44e6be345122803cUL, |
| 509 | + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, |
| 510 | + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, |
| 511 | + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, |
| 512 | + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, |
| 513 | + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, |
| 514 | + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, |
| 515 | + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, |
| 516 | + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, |
| 517 | + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, |
| 518 | + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, |
| 519 | + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, |
| 520 | + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, |
| 521 | + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, |
| 522 | + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, |
| 523 | + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, |
| 524 | + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, |
| 525 | + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, |
| 526 | + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, |
| 527 | + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, |
| 528 | + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, |
| 529 | + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, |
| 530 | + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, |
| 531 | + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, |
| 532 | + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, |
| 533 | + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, |
| 534 | + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, |
| 535 | + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, |
| 536 | + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, |
| 537 | + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, |
| 538 | + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, |
| 539 | + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, |
| 540 | + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, |
| 541 | + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, |
| 542 | + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, |
| 543 | + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, |
| 544 | + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, |
| 545 | + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, |
| 546 | + 0x33979624f0e917beUL, 0x2c018dc527356b30UL, |
| 547 | + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, |
| 548 | + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, |
| 549 | + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, |
| 550 | + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, |
| 551 | + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, |
| 552 | + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, |
| 553 | + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, |
| 554 | + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, |
| 555 | + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, |
| 556 | + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, |
| 557 | + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, |
| 558 | + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, |
| 559 | + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, |
| 560 | + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, |
| 561 | + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, |
| 562 | + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, |
| 563 | + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, |
| 564 | + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, |
| 565 | + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, |
| 566 | + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, |
| 567 | + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, |
| 568 | + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, |
| 569 | + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, |
| 570 | + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, |
| 571 | + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, |
| 572 | + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, |
| 573 | + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, |
| 574 | + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, |
| 575 | + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, |
| 576 | + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, |
| 577 | + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, |
| 578 | + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, |
| 579 | + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, |
| 580 | + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, |
| 581 | + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, |
| 582 | + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, |
| 583 | + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, |
| 584 | + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, |
| 585 | + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, |
| 586 | + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, |
| 587 | + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, |
| 588 | + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, |
| 589 | + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, |
| 590 | + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, |
| 591 | + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, |
| 592 | + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, |
| 593 | + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, |
| 594 | + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, |
| 595 | + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, |
| 596 | + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, |
| 597 | + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, |
| 598 | + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, |
| 599 | + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, |
| 600 | + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, |
| 601 | + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, |
| 602 | + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, |
| 603 | + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, |
| 604 | + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, |
| 605 | + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, |
| 606 | + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, |
| 607 | + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, |
| 608 | + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, |
| 609 | + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, |
| 610 | + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, |
| 611 | + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, |
| 612 | + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, |
| 613 | + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, |
| 614 | + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, |
| 615 | + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, |
| 616 | + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, |
| 617 | + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, |
| 618 | + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, |
| 619 | + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, |
| 620 | + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, |
| 621 | + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, |
| 622 | + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, |
| 623 | + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, |
| 624 | + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, |
| 625 | + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL, |
| 626 | + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, |
| 627 | + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, |
| 628 | + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, |
| 629 | + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, |
| 630 | + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, |
| 631 | + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, |
| 632 | + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, |
| 633 | + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, |
| 634 | + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, |
| 635 | + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, |
| 636 | + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, |
| 637 | + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, |
| 638 | + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, |
| 639 | + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, |
| 640 | + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, |
| 641 | + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, |
| 642 | + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL |
| 643 | +}; |
| 644 | + |
| 645 | +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] |
| 646 | + * a is two 256-bit integers: a0[0:3] and a1[4:7] (indices are 64-bit limbs, lowest first) |
| 647 | + * b is two 256-bit integers: b0[0:3] and b1[4:7]; the two products are independent |
| 648 | + */ |
| 649 | +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, |
| 650 | + const u64 *const b) |
| 651 | +{ |
| 652 | + asm volatile( |
| 653 | + "xorl %%r14d, %%r14d ;" /* r14 = 0, the zero addend for carry folds */ |
| 654 | + "movq (%1), %%rdx; " /* A[0] */ |
| 655 | + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ |
| 656 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF; r10 is overwritten below */ |
| 657 | + "movq %%r8, (%0) ;" /* c[0] is final */ |
| 658 | + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ |
| 659 | + "adox %%r10, %%r15 ;" |
| 660 | + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ |
| 661 | + "adox %%r8, %%rax ;" |
| 662 | + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ |
| 663 | + "adox %%r10, %%rbx ;" |
| 664 | + /******************************************/ |
| 665 | + "adox %%r14, %%rcx ;" /* rcx += OF */ |
| 666 | + |
| 667 | + "movq 8(%1), %%rdx; " /* A[1] */ |
| 668 | + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ |
| 669 | + "adox %%r15, %%r8 ;" |
| 670 | + "movq %%r8, 8(%0) ;" /* c[1] is final */ |
| 671 | + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ |
| 672 | + "adox %%r10, %%r9 ;" /* OF chain: hi(A[1]*B[0]) + lo(A[1]*B[1]) */ |
| 673 | + "adcx %%r9, %%rax ;" /* CF chain: add into the running sum */ |
| 674 | + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ |
| 675 | + "adox %%r8, %%r11 ;" |
| 676 | + "adcx %%r11, %%rbx ;" |
| 677 | + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ |
| 678 | + "adox %%r10, %%r13 ;" |
| 679 | + "adcx %%r13, %%rcx ;" |
| 680 | + /******************************************/ |
| 681 | + "adox %%r14, %%r15 ;" /* r15 += OF */ |
| 682 | + "adcx %%r14, %%r15 ;" /* r15 += CF */ |
| 683 | + |
| 684 | + "movq 16(%1), %%rdx; " /* A[2] */ |
| 685 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 686 | + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ |
| 687 | + "adox %%rax, %%r8 ;" |
| 688 | + "movq %%r8, 16(%0) ;" /* c[2] is final */ |
| 689 | + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ |
| 690 | + "adox %%r10, %%r9 ;" |
| 691 | + "adcx %%r9, %%rbx ;" |
| 692 | + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ |
| 693 | + "adox %%r8, %%r11 ;" |
| 694 | + "adcx %%r11, %%rcx ;" |
| 695 | + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ |
| 696 | + "adox %%r10, %%r13 ;" |
| 697 | + "adcx %%r13, %%r15 ;" |
| 698 | + /******************************************/ |
| 699 | + "adox %%r14, %%rax ;" /* rax += OF */ |
| 700 | + "adcx %%r14, %%rax ;" /* rax += CF */ |
| 701 | + |
| 702 | + "movq 24(%1), %%rdx; " /* A[3] */ |
| 703 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 704 | + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ |
| 705 | + "adox %%rbx, %%r8 ;" |
| 706 | + "movq %%r8, 24(%0) ;" /* c[3] is final */ |
| 707 | + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ |
| 708 | + "adox %%r10, %%r9 ;" |
| 709 | + "adcx %%r9, %%rcx ;" |
| 710 | + "movq %%rcx, 32(%0) ;" /* c[4] */ |
| 711 | + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ |
| 712 | + "adox %%r8, %%r11 ;" |
| 713 | + "adcx %%r11, %%r15 ;" |
| 714 | + "movq %%r15, 40(%0) ;" /* c[5] */ |
| 715 | + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ |
| 716 | + "adox %%r10, %%r13 ;" |
| 717 | + "adcx %%r13, %%rax ;" |
| 718 | + "movq %%rax, 48(%0) ;" /* c[6] */ |
| 719 | + /******************************************/ |
| 720 | + "adox %%r14, %%rbx ;" /* rbx += OF */ |
| 721 | + "adcx %%r14, %%rbx ;" /* rbx += CF */ |
| 722 | + "movq %%rbx, 56(%0) ;" /* c[7] */ |
| 723 | + |
| 724 | + "movq 32(%1), %%rdx; " /* C[0], i.e. a[4] */ |
| 725 | + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0], D[0] is b[4] */ |
| 726 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 727 | + "movq %%r8, 64(%0);" /* c[8] is final */ |
| 728 | + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ |
| 729 | + "adox %%r10, %%r15 ;" |
| 730 | + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ |
| 731 | + "adox %%r8, %%rax ;" |
| 732 | + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ |
| 733 | + "adox %%r10, %%rbx ;" |
| 734 | + /******************************************/ |
| 735 | + "adox %%r14, %%rcx ;" /* rcx += OF */ |
| 736 | + |
| 737 | + "movq 40(%1), %%rdx; " /* C[1] */ |
| 738 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 739 | + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ |
| 740 | + "adox %%r15, %%r8 ;" |
| 741 | + "movq %%r8, 72(%0);" /* c[9] is final */ |
| 742 | + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ |
| 743 | + "adox %%r10, %%r9 ;" |
| 744 | + "adcx %%r9, %%rax ;" |
| 745 | + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ |
| 746 | + "adox %%r8, %%r11 ;" |
| 747 | + "adcx %%r11, %%rbx ;" |
| 748 | + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ |
| 749 | + "adox %%r10, %%r13 ;" |
| 750 | + "adcx %%r13, %%rcx ;" |
| 751 | + /******************************************/ |
| 752 | + "adox %%r14, %%r15 ;" /* r15 += OF */ |
| 753 | + "adcx %%r14, %%r15 ;" /* r15 += CF */ |
| 754 | + |
| 755 | + "movq 48(%1), %%rdx; " /* C[2] */ |
| 756 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 757 | + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ |
| 758 | + "adox %%rax, %%r8 ;" |
| 759 | + "movq %%r8, 80(%0);" /* c[10] is final */ |
| 760 | + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ |
| 761 | + "adox %%r10, %%r9 ;" |
| 762 | + "adcx %%r9, %%rbx ;" |
| 763 | + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ |
| 764 | + "adox %%r8, %%r11 ;" |
| 765 | + "adcx %%r11, %%rcx ;" |
| 766 | + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ |
| 767 | + "adox %%r10, %%r13 ;" |
| 768 | + "adcx %%r13, %%r15 ;" |
| 769 | + /******************************************/ |
| 770 | + "adox %%r14, %%rax ;" /* rax += OF */ |
| 771 | + "adcx %%r14, %%rax ;" /* rax += CF */ |
| 772 | + |
| 773 | + "movq 56(%1), %%rdx; " /* C[3] */ |
| 774 | + "xorl %%r10d, %%r10d ;" /* clear CF and OF */ |
| 775 | + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ |
| 776 | + "adox %%rbx, %%r8 ;" |
| 777 | + "movq %%r8, 88(%0);" /* c[11] is final */ |
| 778 | + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ |
| 779 | + "adox %%r10, %%r9 ;" |
| 780 | + "adcx %%r9, %%rcx ;" |
| 781 | + "movq %%rcx, 96(%0) ;" /* c[12] */ |
| 782 | + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ |
| 783 | + "adox %%r8, %%r11 ;" |
| 784 | + "adcx %%r11, %%r15 ;" |
| 785 | + "movq %%r15, 104(%0) ;" /* c[13] */ |
| 786 | + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ |
| 787 | + "adox %%r10, %%r13 ;" |
| 788 | + "adcx %%r13, %%rax ;" |
| 789 | + "movq %%rax, 112(%0) ;" /* c[14] */ |
| 790 | + /******************************************/ |
| 791 | + "adox %%r14, %%rbx ;" /* rbx += OF */ |
| 792 | + "adcx %%r14, %%rbx ;" /* rbx += CF */ |
| 793 | + "movq %%rbx, 120(%0) ;" /* c[15] */ |
| 794 | + : |
| 795 | + : "r"(c), "r"(a), "r"(b) |
| 796 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 797 | + "%r10", "%r11", "%r13", "%r14", "%r15"); |
| 798 | +} |
| 799 | + |
| 800 | +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, |
| 801 | + const u64 *const b) |
| 802 | +{ /* c[0:7] = a[0:3]*b[0:3], c[8:15] = a[4:7]*b[4:7]; 64-bit limbs, mulx + add/adc */ |
| 803 | + asm volatile( |
| 804 | + "movq (%1), %%rdx; " /* A[0] */ |
| 805 | + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ |
| 806 | + "movq %%r8, (%0) ;" /* c[0] is final */ |
| 807 | + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ |
| 808 | + "addq %%r10, %%r15 ;" /* starts the carry chain of this row */ |
| 809 | + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ |
| 810 | + "adcq %%r8, %%rax ;" |
| 811 | + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ |
| 812 | + "adcq %%r10, %%rbx ;" |
| 813 | + /******************************************/ |
| 814 | + "adcq $0, %%rcx ;" /* rcx += CF */ |
| 815 | + |
| 816 | + "movq 8(%1), %%rdx; " /* A[1] */ |
| 817 | + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ |
| 818 | + "addq %%r15, %%r8 ;" |
| 819 | + "movq %%r8, 8(%0) ;" /* c[1] is final */ |
| 820 | + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ |
| 821 | + "adcq %%r10, %%r9 ;" |
| 822 | + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ |
| 823 | + "adcq %%r8, %%r11 ;" |
| 824 | + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ |
| 825 | + "adcq %%r10, %%r13 ;" |
| 826 | + /******************************************/ |
| 827 | + "adcq $0, %%r15 ;" /* r15 += CF */ |
| 828 | + |
| 829 | + "addq %%r9, %%rax ;" /* add row A[1] into the running sum */ |
| 830 | + "adcq %%r11, %%rbx ;" |
| 831 | + "adcq %%r13, %%rcx ;" |
| 832 | + "adcq $0, %%r15 ;" |
| 833 | + |
| 834 | + "movq 16(%1), %%rdx; " /* A[2] */ |
| 835 | + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ |
| 836 | + "addq %%rax, %%r8 ;" |
| 837 | + "movq %%r8, 16(%0) ;" /* c[2] is final */ |
| 838 | + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ |
| 839 | + "adcq %%r10, %%r9 ;" |
| 840 | + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ |
| 841 | + "adcq %%r8, %%r11 ;" |
| 842 | + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ |
| 843 | + "adcq %%r10, %%r13 ;" |
| 844 | + /******************************************/ |
| 845 | + "adcq $0, %%rax ;" /* rax += CF */ |
| 846 | + |
| 847 | + "addq %%r9, %%rbx ;" /* add row A[2] into the running sum */ |
| 848 | + "adcq %%r11, %%rcx ;" |
| 849 | + "adcq %%r13, %%r15 ;" |
| 850 | + "adcq $0, %%rax ;" |
| 851 | + |
| 852 | + "movq 24(%1), %%rdx; " /* A[3] */ |
| 853 | + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ |
| 854 | + "addq %%rbx, %%r8 ;" |
| 855 | + "movq %%r8, 24(%0) ;" /* c[3] is final */ |
| 856 | + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ |
| 857 | + "adcq %%r10, %%r9 ;" |
| 858 | + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ |
| 859 | + "adcq %%r8, %%r11 ;" |
| 860 | + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ |
| 861 | + "adcq %%r10, %%r13 ;" |
| 862 | + /******************************************/ |
| 863 | + "adcq $0, %%rbx ;" /* rbx += CF */ |
| 864 | + |
| 865 | + "addq %%r9, %%rcx ;" /* add row A[3]; movq does not disturb CF */ |
| 866 | + "movq %%rcx, 32(%0) ;" /* c[4] */ |
| 867 | + "adcq %%r11, %%r15 ;" |
| 868 | + "movq %%r15, 40(%0) ;" /* c[5] */ |
| 869 | + "adcq %%r13, %%rax ;" |
| 870 | + "movq %%rax, 48(%0) ;" /* c[6] */ |
| 871 | + "adcq $0, %%rbx ;" |
| 872 | + "movq %%rbx, 56(%0) ;" /* c[7] */ |
| 873 | + |
| 874 | + "movq 32(%1), %%rdx; " /* C[0], i.e. a[4] */ |
| 875 | + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0], D[0] is b[4] */ |
| 876 | + "movq %%r8, 64(%0) ;" /* c[8] is final */ |
| 877 | + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ |
| 878 | + "addq %%r10, %%r15 ;" |
| 879 | + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ |
| 880 | + "adcq %%r8, %%rax ;" |
| 881 | + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ |
| 882 | + "adcq %%r10, %%rbx ;" |
| 883 | + /******************************************/ |
| 884 | + "adcq $0, %%rcx ;" /* rcx += CF */ |
| 885 | + |
| 886 | + "movq 40(%1), %%rdx; " /* C[1] */ |
| 887 | + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ |
| 888 | + "addq %%r15, %%r8 ;" |
| 889 | + "movq %%r8, 72(%0) ;" /* c[9] is final */ |
| 890 | + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ |
| 891 | + "adcq %%r10, %%r9 ;" |
| 892 | + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ |
| 893 | + "adcq %%r8, %%r11 ;" |
| 894 | + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ |
| 895 | + "adcq %%r10, %%r13 ;" |
| 896 | + /******************************************/ |
| 897 | + "adcq $0, %%r15 ;" /* r15 += CF */ |
| 898 | + |
| 899 | + "addq %%r9, %%rax ;" /* add row C[1] into the running sum */ |
| 900 | + "adcq %%r11, %%rbx ;" |
| 901 | + "adcq %%r13, %%rcx ;" |
| 902 | + "adcq $0, %%r15 ;" |
| 903 | + |
| 904 | + "movq 48(%1), %%rdx; " /* C[2] */ |
| 905 | + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ |
| 906 | + "addq %%rax, %%r8 ;" |
| 907 | + "movq %%r8, 80(%0) ;" /* c[10] is final */ |
| 908 | + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ |
| 909 | + "adcq %%r10, %%r9 ;" |
| 910 | + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ |
| 911 | + "adcq %%r8, %%r11 ;" |
| 912 | + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ |
| 913 | + "adcq %%r10, %%r13 ;" |
| 914 | + /******************************************/ |
| 915 | + "adcq $0, %%rax ;" /* rax += CF */ |
| 916 | + |
| 917 | + "addq %%r9, %%rbx ;" /* add row C[2] into the running sum */ |
| 918 | + "adcq %%r11, %%rcx ;" |
| 919 | + "adcq %%r13, %%r15 ;" |
| 920 | + "adcq $0, %%rax ;" |
| 921 | + |
| 922 | + "movq 56(%1), %%rdx; " /* C[3] */ |
| 923 | + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ |
| 924 | + "addq %%rbx, %%r8 ;" |
| 925 | + "movq %%r8, 88(%0) ;" /* c[11] is final */ |
| 926 | + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ |
| 927 | + "adcq %%r10, %%r9 ;" |
| 928 | + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ |
| 929 | + "adcq %%r8, %%r11 ;" |
| 930 | + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ |
| 931 | + "adcq %%r10, %%r13 ;" |
| 932 | + /******************************************/ |
| 933 | + "adcq $0, %%rbx ;" /* rbx += CF */ |
| 934 | + |
| 935 | + "addq %%r9, %%rcx ;" /* add row C[3]; movq does not disturb CF */ |
| 936 | + "movq %%rcx, 96(%0) ;" /* c[12] */ |
| 937 | + "adcq %%r11, %%r15 ;" |
| 938 | + "movq %%r15, 104(%0) ;" /* c[13] */ |
| 939 | + "adcq %%r13, %%rax ;" |
| 940 | + "movq %%rax, 112(%0) ;" /* c[14] */ |
| 941 | + "adcq $0, %%rbx ;" |
| 942 | + "movq %%rbx, 120(%0) ;" /* c[15] */ |
| 943 | + : |
| 944 | + : "r"(c), "r"(a), "r"(b) |
| 945 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 946 | + "%r10", "%r11", "%r13", "%r15"); |
| 947 | +} |
| 948 | + |
| 949 | +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) |
| 950 | +{ /* c[0:7] = a[0:3]^2, c[8:15] = a[4:7]^2; 64-bit limbs, mulx + adcx/adox */ |
| 951 | + asm volatile( |
| 952 | + "movq (%1), %%rdx ;" /* A[0] */ |
| 953 | + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ |
| 954 | + "xorl %%r15d, %%r15d;" /* r15 = 0; also clears CF and OF */ |
| 955 | + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ |
| 956 | + "adcx %%r14, %%r9 ;" |
| 957 | + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ |
| 958 | + "adcx %%rax, %%r10 ;" |
| 959 | + "movq 24(%1), %%rdx ;" /* A[3] */ |
| 960 | + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ |
| 961 | + "adcx %%rcx, %%r11 ;" |
| 962 | + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ |
| 963 | + "adcx %%rax, %%rbx ;" |
| 964 | + "movq 8(%1), %%rdx ;" /* A[1] */ |
| 965 | + "adcx %%r15, %%r13 ;" |
| 966 | + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ |
| 967 | + "movq $0, %%r14 ;" /* mov, not xor: CF must survive */ |
| 968 | + /******************************************/ |
| 969 | + "adcx %%r15, %%r14 ;" /* r14 = carry out of the cross terms */ |
| 970 | + |
| 971 | + "xorl %%r15d, %%r15d;" /* restart both carry chains */ |
| 972 | + "adox %%rax, %%r10 ;" /* OF chain: add A[2]*A[1] */ |
| 973 | + "adcx %%r8, %%r8 ;" /* CF chain: double the cross terms */ |
| 974 | + "adox %%rcx, %%r11 ;" |
| 975 | + "adcx %%r9, %%r9 ;" |
| 976 | + "adox %%r15, %%rbx ;" |
| 977 | + "adcx %%r10, %%r10 ;" |
| 978 | + "adox %%r15, %%r13 ;" |
| 979 | + "adcx %%r11, %%r11 ;" |
| 980 | + "adox %%r15, %%r14 ;" |
| 981 | + "adcx %%rbx, %%rbx ;" |
| 982 | + "adcx %%r13, %%r13 ;" |
| 983 | + "adcx %%r14, %%r14 ;" |
| 984 | + |
| 985 | + "movq (%1), %%rdx ;" |
| 986 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ |
| 987 | + /*******************/ |
| 988 | + "movq %%rax, 0(%0) ;" /* c[0] */ |
| 989 | + "addq %%rcx, %%r8 ;" /* add the diagonal squares from here on */ |
| 990 | + "movq %%r8, 8(%0) ;" /* c[1] */ |
| 991 | + "movq 8(%1), %%rdx ;" |
| 992 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ |
| 993 | + "adcq %%rax, %%r9 ;" |
| 994 | + "movq %%r9, 16(%0) ;" /* c[2] */ |
| 995 | + "adcq %%rcx, %%r10 ;" |
| 996 | + "movq %%r10, 24(%0) ;" /* c[3] */ |
| 997 | + "movq 16(%1), %%rdx ;" |
| 998 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ |
| 999 | + "adcq %%rax, %%r11 ;" |
| 1000 | + "movq %%r11, 32(%0) ;" /* c[4] */ |
| 1001 | + "adcq %%rcx, %%rbx ;" |
| 1002 | + "movq %%rbx, 40(%0) ;" /* c[5] */ |
| 1003 | + "movq 24(%1), %%rdx ;" |
| 1004 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ |
| 1005 | + "adcq %%rax, %%r13 ;" |
| 1006 | + "movq %%r13, 48(%0) ;" /* c[6] */ |
| 1007 | + "adcq %%rcx, %%r14 ;" |
| 1008 | + "movq %%r14, 56(%0) ;" /* c[7] */ |
| 1009 | + |
| 1010 | + |
| 1011 | + "movq 32(%1), %%rdx ;" /* B[0], i.e. a[4] */ |
| 1012 | + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ |
| 1013 | + "xorl %%r15d, %%r15d;" /* r15 = 0; also clears CF and OF */ |
| 1014 | + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ |
| 1015 | + "adcx %%r14, %%r9 ;" |
| 1016 | + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ |
| 1017 | + "adcx %%rax, %%r10 ;" |
| 1018 | + "movq 56(%1), %%rdx ;" /* B[3] */ |
| 1019 | + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ |
| 1020 | + "adcx %%rcx, %%r11 ;" |
| 1021 | + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ |
| 1022 | + "adcx %%rax, %%rbx ;" |
| 1023 | + "movq 40(%1), %%rdx ;" /* B[1] */ |
| 1024 | + "adcx %%r15, %%r13 ;" |
| 1025 | + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ |
| 1026 | + "movq $0, %%r14 ;" /* mov, not xor: CF must survive */ |
| 1027 | + /******************************************/ |
| 1028 | + "adcx %%r15, %%r14 ;" /* r14 = carry out of the cross terms */ |
| 1029 | + |
| 1030 | + "xorl %%r15d, %%r15d;" /* restart both carry chains */ |
| 1031 | + "adox %%rax, %%r10 ;" /* OF chain: add B[2]*B[1] */ |
| 1032 | + "adcx %%r8, %%r8 ;" /* CF chain: double the cross terms */ |
| 1033 | + "adox %%rcx, %%r11 ;" |
| 1034 | + "adcx %%r9, %%r9 ;" |
| 1035 | + "adox %%r15, %%rbx ;" |
| 1036 | + "adcx %%r10, %%r10 ;" |
| 1037 | + "adox %%r15, %%r13 ;" |
| 1038 | + "adcx %%r11, %%r11 ;" |
| 1039 | + "adox %%r15, %%r14 ;" |
| 1040 | + "adcx %%rbx, %%rbx ;" |
| 1041 | + "adcx %%r13, %%r13 ;" |
| 1042 | + "adcx %%r14, %%r14 ;" |
| 1043 | + |
| 1044 | + "movq 32(%1), %%rdx ;" |
| 1045 | + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ |
| 1046 | + /*******************/ |
| 1047 | + "movq %%rax, 64(%0) ;" /* c[8] */ |
| 1048 | + "addq %%rcx, %%r8 ;" /* add the diagonal squares from here on */ |
| 1049 | + "movq %%r8, 72(%0) ;" /* c[9] */ |
| 1050 | + "movq 40(%1), %%rdx ;" |
| 1051 | + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ |
| 1052 | + "adcq %%rax, %%r9 ;" |
| 1053 | + "movq %%r9, 80(%0) ;" /* c[10] */ |
| 1054 | + "adcq %%rcx, %%r10 ;" |
| 1055 | + "movq %%r10, 88(%0) ;" /* c[11] */ |
| 1056 | + "movq 48(%1), %%rdx ;" |
| 1057 | + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ |
| 1058 | + "adcq %%rax, %%r11 ;" |
| 1059 | + "movq %%r11, 96(%0) ;" /* c[12] */ |
| 1060 | + "adcq %%rcx, %%rbx ;" |
| 1061 | + "movq %%rbx, 104(%0) ;" /* c[13] */ |
| 1062 | + "movq 56(%1), %%rdx ;" |
| 1063 | + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ |
| 1064 | + "adcq %%rax, %%r13 ;" |
| 1065 | + "movq %%r13, 112(%0) ;" /* c[14] */ |
| 1066 | + "adcq %%rcx, %%r14 ;" |
| 1067 | + "movq %%r14, 120(%0) ;" /* c[15] */ |
| 1068 | + : |
| 1069 | + : "r"(c), "r"(a) |
| 1070 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 1071 | + "%r10", "%r11", "%r13", "%r14", "%r15"); |
| 1072 | +} |
| 1073 | + |
| 1074 | +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) |
| 1075 | +{ /* c[0:7] = a[0:3]^2, c[8:15] = a[4:7]^2; 64-bit limbs, mulx + add/adc/shld */ |
| 1076 | + asm volatile( |
| 1077 | + "movq 8(%1), %%rdx ;" /* A[1] */ |
| 1078 | + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ |
| 1079 | + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ |
| 1080 | + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ |
| 1081 | + |
| 1082 | + "movq 16(%1), %%rdx ;" /* A[2] */ |
| 1083 | + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ |
| 1084 | + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2], high half replaces rdx */ |
| 1085 | + |
| 1086 | + "addq %%rax, %%r9 ;" /* sum the cross terms */ |
| 1087 | + "adcq %%rdx, %%r10 ;" |
| 1088 | + "adcq %%rcx, %%r11 ;" |
| 1089 | + "adcq %%r14, %%r15 ;" |
| 1090 | + "adcq $0, %%r13 ;" |
| 1091 | + "movq $0, %%r14 ;" /* mov, not xor: CF must survive */ |
| 1092 | + "adcq $0, %%r14 ;" /* r14 = carry out */ |
| 1093 | + |
| 1094 | + "movq (%1), %%rdx ;" /* A[0] */ |
| 1095 | + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ |
| 1096 | + |
| 1097 | + "addq %%rax, %%r10 ;" |
| 1098 | + "adcq %%rcx, %%r11 ;" |
| 1099 | + "adcq $0, %%r15 ;" |
| 1100 | + "adcq $0, %%r13 ;" |
| 1101 | + "adcq $0, %%r14 ;" |
| 1102 | + |
| 1103 | + "shldq $1, %%r13, %%r14 ;" /* double the cross terms: shift left by 1, */ |
| 1104 | + "shldq $1, %%r15, %%r13 ;" /* top limb first so each limb takes the */ |
| 1105 | + "shldq $1, %%r11, %%r15 ;" /* high bit of the limb below it */ |
| 1106 | + "shldq $1, %%r10, %%r11 ;" |
| 1107 | + "shldq $1, %%r9, %%r10 ;" |
| 1108 | + "shldq $1, %%r8, %%r9 ;" |
| 1109 | + "shlq $1, %%r8 ;" |
| 1110 | + |
| 1111 | + /*******************/ |
| 1112 | + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2, rdx still holds A[0] */ |
| 1113 | + /*******************/ |
| 1114 | + "movq %%rax, 0(%0) ;" /* c[0] */ |
| 1115 | + "addq %%rcx, %%r8 ;" /* add the diagonal squares from here on */ |
| 1116 | + "movq %%r8, 8(%0) ;" /* c[1] */ |
| 1117 | + "movq 8(%1), %%rdx ;" |
| 1118 | + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ |
| 1119 | + "adcq %%rax, %%r9 ;" |
| 1120 | + "movq %%r9, 16(%0) ;" /* c[2] */ |
| 1121 | + "adcq %%rcx, %%r10 ;" |
| 1122 | + "movq %%r10, 24(%0) ;" /* c[3] */ |
| 1123 | + "movq 16(%1), %%rdx ;" |
| 1124 | + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ |
| 1125 | + "adcq %%rax, %%r11 ;" |
| 1126 | + "movq %%r11, 32(%0) ;" /* c[4] */ |
| 1127 | + "adcq %%rcx, %%r15 ;" |
| 1128 | + "movq %%r15, 40(%0) ;" /* c[5] */ |
| 1129 | + "movq 24(%1), %%rdx ;" |
| 1130 | + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ |
| 1131 | + "adcq %%rax, %%r13 ;" |
| 1132 | + "movq %%r13, 48(%0) ;" /* c[6] */ |
| 1133 | + "adcq %%rcx, %%r14 ;" |
| 1134 | + "movq %%r14, 56(%0) ;" /* c[7] */ |
| 1135 | + |
| 1136 | + "movq 40(%1), %%rdx ;" /* B[1], i.e. a[5] */ |
| 1137 | + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ |
| 1138 | + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ |
| 1139 | + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ |
| 1140 | + |
| 1141 | + "movq 48(%1), %%rdx ;" /* B[2] */ |
| 1142 | + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ |
| 1143 | + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2], high half replaces rdx */ |
| 1144 | + |
| 1145 | + "addq %%rax, %%r9 ;" /* sum the cross terms */ |
| 1146 | + "adcq %%rdx, %%r10 ;" |
| 1147 | + "adcq %%rcx, %%r11 ;" |
| 1148 | + "adcq %%r14, %%r15 ;" |
| 1149 | + "adcq $0, %%r13 ;" |
| 1150 | + "movq $0, %%r14 ;" /* mov, not xor: CF must survive */ |
| 1151 | + "adcq $0, %%r14 ;" /* r14 = carry out */ |
| 1152 | + |
| 1153 | + "movq 32(%1), %%rdx ;" /* B[0] */ |
| 1154 | + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ |
| 1155 | + |
| 1156 | + "addq %%rax, %%r10 ;" |
| 1157 | + "adcq %%rcx, %%r11 ;" |
| 1158 | + "adcq $0, %%r15 ;" |
| 1159 | + "adcq $0, %%r13 ;" |
| 1160 | + "adcq $0, %%r14 ;" |
| 1161 | + |
| 1162 | + "shldq $1, %%r13, %%r14 ;" /* double the cross terms: shift left by 1, */ |
| 1163 | + "shldq $1, %%r15, %%r13 ;" /* top limb first so each limb takes the */ |
| 1164 | + "shldq $1, %%r11, %%r15 ;" /* high bit of the limb below it */ |
| 1165 | + "shldq $1, %%r10, %%r11 ;" |
| 1166 | + "shldq $1, %%r9, %%r10 ;" |
| 1167 | + "shldq $1, %%r8, %%r9 ;" |
| 1168 | + "shlq $1, %%r8 ;" |
| 1169 | + |
| 1170 | + /*******************/ |
| 1171 | + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2, rdx still holds B[0] */ |
| 1172 | + /*******************/ |
| 1173 | + "movq %%rax, 64(%0) ;" /* c[8] */ |
| 1174 | + "addq %%rcx, %%r8 ;" /* add the diagonal squares from here on */ |
| 1175 | + "movq %%r8, 72(%0) ;" /* c[9] */ |
| 1176 | + "movq 40(%1), %%rdx ;" |
| 1177 | + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ |
| 1178 | + "adcq %%rax, %%r9 ;" |
| 1179 | + "movq %%r9, 80(%0) ;" /* c[10] */ |
| 1180 | + "adcq %%rcx, %%r10 ;" |
| 1181 | + "movq %%r10, 88(%0) ;" /* c[11] */ |
| 1182 | + "movq 48(%1), %%rdx ;" |
| 1183 | + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ |
| 1184 | + "adcq %%rax, %%r11 ;" |
| 1185 | + "movq %%r11, 96(%0) ;" /* c[12] */ |
| 1186 | + "adcq %%rcx, %%r15 ;" |
| 1187 | + "movq %%r15, 104(%0) ;" /* c[13] */ |
| 1188 | + "movq 56(%1), %%rdx ;" |
| 1189 | + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ |
| 1190 | + "adcq %%rax, %%r13 ;" |
| 1191 | + "movq %%r13, 112(%0) ;" /* c[14] */ |
| 1192 | + "adcq %%rcx, %%r14 ;" |
| 1193 | + "movq %%r14, 120(%0) ;" /* c[15] */ |
| 1194 | + : |
| 1195 | + : "r"(c), "r"(a) |
| 1196 | + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", |
| 1197 | + "%r11", "%r13", "%r14", "%r15"); |
| 1198 | +} |
| 1199 | + |
| 1200 | +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) |
| 1201 | +{ /* c[0:3] = a[0:3] + 38*a[4:7], c[4:7] = a[8:11] + 38*a[12:15], carries folded back in */ |
| 1202 | + asm volatile( |
| 1203 | + "movl $38, %%edx; " /* 2*c = 38, congruent to 2^256 mod 2^255-19 */ |
| 1204 | + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ |
| 1205 | + "xorl %%ebx, %%ebx ;" /* rbx = 0; also clears CF and OF */ |
| 1206 | + "adox (%1), %%r8 ;" /* OF chain: add the low half C[0:3] */ |
| 1207 | + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ |
| 1208 | + "adcx %%r10, %%r9 ;" /* CF chain: link the partial products */ |
| 1209 | + "adox 8(%1), %%r9 ;" |
| 1210 | + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ |
| 1211 | + "adcx %%r11, %%r10 ;" |
| 1212 | + "adox 16(%1), %%r10 ;" |
| 1213 | + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ |
| 1214 | + "adcx %%rax, %%r11 ;" |
| 1215 | + "adox 24(%1), %%r11 ;" |
| 1216 | + /***************************************/ |
| 1217 | + "adcx %%rbx, %%rcx ;" /* rcx += CF */ |
| 1218 | + "adox %%rbx, %%rcx ;" /* rcx += OF; rcx is the overflow limb */ |
| 1219 | + "imul %%rdx, %%rcx ;" /* rcx = c * overflow limb, cf=0, of=0 */ |
| 1220 | + "adcx %%rcx, %%r8 ;" /* fold it back into the low limb */ |
| 1221 | + "adcx %%rbx, %%r9 ;" |
| 1222 | + "movq %%r9, 8(%0) ;" |
| 1223 | + "adcx %%rbx, %%r10 ;" |
| 1224 | + "movq %%r10, 16(%0) ;" |
| 1225 | + "adcx %%rbx, %%r11 ;" |
| 1226 | + "movq %%r11, 24(%0) ;" |
| 1227 | + "mov $0, %%ecx ;" /* mov, not xor: CF must survive */ |
| 1228 | + "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */ |
| 1229 | + "addq %%rcx, %%r8 ;" |
| 1230 | + "movq %%r8, (%0) ;" |
| 1231 | + |
| 1232 | + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] of the second value, a[12] */ |
| 1233 | + "xorl %%ebx, %%ebx ;" /* rbx = 0; also clears CF and OF */ |
| 1234 | + "adox 64(%1), %%r8 ;" /* OF chain: add the low half a[8:11] */ |
| 1235 | + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ |
| 1236 | + "adcx %%r10, %%r9 ;" /* CF chain: link the partial products */ |
| 1237 | + "adox 72(%1), %%r9 ;" |
| 1238 | + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ |
| 1239 | + "adcx %%r11, %%r10 ;" |
| 1240 | + "adox 80(%1), %%r10 ;" |
| 1241 | + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ |
| 1242 | + "adcx %%rax, %%r11 ;" |
| 1243 | + "adox 88(%1), %%r11 ;" |
| 1244 | + /****************************************/ |
| 1245 | + "adcx %%rbx, %%rcx ;" /* rcx += CF */ |
| 1246 | + "adox %%rbx, %%rcx ;" /* rcx += OF; rcx is the overflow limb */ |
| 1247 | + "imul %%rdx, %%rcx ;" /* rcx = c * overflow limb, cf=0, of=0 */ |
| 1248 | + "adcx %%rcx, %%r8 ;" /* fold it back into the low limb */ |
| 1249 | + "adcx %%rbx, %%r9 ;" |
| 1250 | + "movq %%r9, 40(%0) ;" |
| 1251 | + "adcx %%rbx, %%r10 ;" |
| 1252 | + "movq %%r10, 48(%0) ;" |
| 1253 | + "adcx %%rbx, %%r11 ;" |
| 1254 | + "movq %%r11, 56(%0) ;" |
| 1255 | + "mov $0, %%ecx ;" /* mov, not xor: CF must survive */ |
| 1256 | + "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */ |
| 1257 | + "addq %%rcx, %%r8 ;" |
| 1258 | + "movq %%r8, 32(%0) ;" |
| 1259 | + : |
| 1260 | + : "r"(c), "r"(a) |
| 1261 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 1262 | + "%r10", "%r11"); |
| 1263 | +} |
| 1264 | + |
| 1265 | +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) |
| 1266 | +{ /* c[0:3] = a[0:3] + 38*a[4:7], c[4:7] = a[8:11] + 38*a[12:15], carries folded back in */ |
| 1267 | + asm volatile( |
| 1268 | + "movl $38, %%edx ; " /* 2*c = 38, congruent to 2^256 mod 2^255-19 */ |
| 1269 | + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ |
| 1270 | + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ |
| 1271 | + "addq %%r10, %%r9 ;" /* link the partial products */ |
| 1272 | + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ |
| 1273 | + "adcq %%r11, %%r10 ;" |
| 1274 | + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ |
| 1275 | + "adcq %%rax, %%r11 ;" |
| 1276 | + /***************************************/ |
| 1277 | + "adcq $0, %%rcx ;" /* rcx is the overflow limb */ |
| 1278 | + "addq (%1), %%r8 ;" /* add the low half C[0:3] */ |
| 1279 | + "adcq 8(%1), %%r9 ;" |
| 1280 | + "adcq 16(%1), %%r10 ;" |
| 1281 | + "adcq 24(%1), %%r11 ;" |
| 1282 | + "adcq $0, %%rcx ;" |
| 1283 | + "imul %%rdx, %%rcx ;" /* rcx = c * overflow limb, cf=0 */ |
| 1284 | + "addq %%rcx, %%r8 ;" /* fold it back into the low limb */ |
| 1285 | + "adcq $0, %%r9 ;" |
| 1286 | + "movq %%r9, 8(%0) ;" |
| 1287 | + "adcq $0, %%r10 ;" |
| 1288 | + "movq %%r10, 16(%0) ;" |
| 1289 | + "adcq $0, %%r11 ;" |
| 1290 | + "movq %%r11, 24(%0) ;" |
| 1291 | + "mov $0, %%ecx ;" /* mov, not xor: CF must survive */ |
| 1292 | + "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */ |
| 1293 | + "addq %%rcx, %%r8 ;" |
| 1294 | + "movq %%r8, (%0) ;" |
| 1295 | + |
| 1296 | + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] of the second value, a[12] */ |
| 1297 | + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ |
| 1298 | + "addq %%r10, %%r9 ;" /* link the partial products */ |
| 1299 | + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ |
| 1300 | + "adcq %%r11, %%r10 ;" |
| 1301 | + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ |
| 1302 | + "adcq %%rax, %%r11 ;" |
| 1303 | + /****************************************/ |
| 1304 | + "adcq $0, %%rcx ;" /* rcx is the overflow limb */ |
| 1305 | + "addq 64(%1), %%r8 ;" /* add the low half a[8:11] */ |
| 1306 | + "adcq 72(%1), %%r9 ;" |
| 1307 | + "adcq 80(%1), %%r10 ;" |
| 1308 | + "adcq 88(%1), %%r11 ;" |
| 1309 | + "adcq $0, %%rcx ;" |
| 1310 | + "imul %%rdx, %%rcx ;" /* rcx = c * overflow limb, cf=0 */ |
| 1311 | + "addq %%rcx, %%r8 ;" /* fold it back into the low limb */ |
| 1312 | + "adcq $0, %%r9 ;" |
| 1313 | + "movq %%r9, 40(%0) ;" |
| 1314 | + "adcq $0, %%r10 ;" |
| 1315 | + "movq %%r10, 48(%0) ;" |
| 1316 | + "adcq $0, %%r11 ;" |
| 1317 | + "movq %%r11, 56(%0) ;" |
| 1318 | + "mov $0, %%ecx ;" /* mov, not xor: CF must survive */ |
| 1319 | + "cmovc %%edx, %%ecx ;" /* rcx = 38 if the fold carried out, else 0 */ |
| 1320 | + "addq %%rcx, %%r8 ;" |
| 1321 | + "movq %%r8, 32(%0) ;" |
| 1322 | + : |
| 1323 | + : "r"(c), "r"(a) |
| 1324 | + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", |
| 1325 | + "%r11"); |
| 1326 | +} |
| 1327 | +/* c[0..7] = a[0..3] * b[0..3] (full 512-bit product) via mulx with adcx/adox carry chains; c is stored while a and b are still being read, so c must not overlap them. */ |
| 1328 | +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, |
| 1329 | + const u64 *const b) |
| 1330 | +{ |
| 1331 | + asm volatile( |
| 1332 | + "movq (%1), %%rdx; " /* A[0] */ |
| 1333 | + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ |
| 1334 | + "xorl %%r10d, %%r10d ;" /* clears CF and OF before the carry chain starts */ |
| 1335 | + "movq %%r8, (%0) ;" |
| 1336 | + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ |
| 1337 | + "adox %%r9, %%r10 ;" |
| 1338 | + "movq %%r10, 8(%0) ;" |
| 1339 | + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ |
| 1340 | + "adox %%r11, %%r15 ;" |
| 1341 | + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ |
| 1342 | + "adox %%r13, %%r14 ;" |
| 1343 | + "movq $0, %%rax ;" /* mov rather than xor: the pending carry flags must survive */ |
| 1344 | + /******************************************/ |
| 1345 | + "adox %%rdx, %%rax ;" |
| 1346 | + |
| 1347 | + "movq 8(%1), %%rdx; " /* A[1] */ |
| 1348 | + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ |
| 1349 | + "xorl %%r10d, %%r10d ;" /* clears CF and OF for this row's two chains */ |
| 1350 | + "adcx 8(%0), %%r8 ;" /* add the partial word stored by the previous row */ |
| 1351 | + "movq %%r8, 8(%0) ;" |
| 1352 | + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ |
| 1353 | + "adox %%r9, %%r10 ;" |
| 1354 | + "adcx %%r15, %%r10 ;" |
| 1355 | + "movq %%r10, 16(%0) ;" |
| 1356 | + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ |
| 1357 | + "adox %%r11, %%r15 ;" |
| 1358 | + "adcx %%r14, %%r15 ;" |
| 1359 | + "movq $0, %%r8 ;" |
| 1360 | + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ |
| 1361 | + "adox %%r13, %%r14 ;" |
| 1362 | + "adcx %%rax, %%r14 ;" |
| 1363 | + "movq $0, %%rax ;" |
| 1364 | + /******************************************/ |
| 1365 | + "adox %%rdx, %%rax ;" |
| 1366 | + "adcx %%r8, %%rax ;" |
| 1367 | + |
| 1368 | + "movq 16(%1), %%rdx; " /* A[2] */ |
| 1369 | + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ |
| 1370 | + "xorl %%r10d, %%r10d ;" |
| 1371 | + "adcx 16(%0), %%r8 ;" |
| 1372 | + "movq %%r8, 16(%0) ;" |
| 1373 | + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ |
| 1374 | + "adox %%r9, %%r10 ;" |
| 1375 | + "adcx %%r15, %%r10 ;" |
| 1376 | + "movq %%r10, 24(%0) ;" |
| 1377 | + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ |
| 1378 | + "adox %%r11, %%r15 ;" |
| 1379 | + "adcx %%r14, %%r15 ;" |
| 1380 | + "movq $0, %%r8 ;" |
| 1381 | + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ |
| 1382 | + "adox %%r13, %%r14 ;" |
| 1383 | + "adcx %%rax, %%r14 ;" |
| 1384 | + "movq $0, %%rax ;" |
| 1385 | + /******************************************/ |
| 1386 | + "adox %%rdx, %%rax ;" |
| 1387 | + "adcx %%r8, %%rax ;" |
| 1388 | + |
| 1389 | + "movq 24(%1), %%rdx; " /* A[3] */ |
| 1390 | + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ |
| 1391 | + "xorl %%r10d, %%r10d ;" |
| 1392 | + "adcx 24(%0), %%r8 ;" |
| 1393 | + "movq %%r8, 24(%0) ;" |
| 1394 | + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ |
| 1395 | + "adox %%r9, %%r10 ;" |
| 1396 | + "adcx %%r15, %%r10 ;" |
| 1397 | + "movq %%r10, 32(%0) ;" |
| 1398 | + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ |
| 1399 | + "adox %%r11, %%r15 ;" |
| 1400 | + "adcx %%r14, %%r15 ;" |
| 1401 | + "movq %%r15, 40(%0) ;" |
| 1402 | + "movq $0, %%r8 ;" |
| 1403 | + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ |
| 1404 | + "adox %%r13, %%r14 ;" |
| 1405 | + "adcx %%rax, %%r14 ;" |
| 1406 | + "movq %%r14, 48(%0) ;" |
| 1407 | + "movq $0, %%rax ;" |
| 1408 | + /******************************************/ |
| 1409 | + "adox %%rdx, %%rax ;" |
| 1410 | + "adcx %%r8, %%rax ;" |
| 1411 | + "movq %%rax, 56(%0) ;" |
| 1412 | + : |
| 1413 | + : "r"(c), "r"(a), "r"(b) |
| 1414 | + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", |
| 1415 | + "%r13", "%r14", "%r15"); |
| 1416 | +} |
| 1417 | +/* c[0..7] = a[0..3] * b[0..3] (full 512-bit product) via mulx with a single add/adc carry chain; c is stored while b is still being re-read, so c must not overlap b. */ |
| 1418 | +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, |
| 1419 | + const u64 *const b) |
| 1420 | +{ |
| 1421 | + asm volatile( |
| 1422 | + "movq (%1), %%rdx; " /* A[0] */ |
| 1423 | + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ |
| 1424 | + "movq %%r8, (%0) ;" |
| 1425 | + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ |
| 1426 | + "addq %%r10, %%r15 ;" |
| 1427 | + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ |
| 1428 | + "adcq %%r8, %%rax ;" |
| 1429 | + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ |
| 1430 | + "adcq %%r10, %%rbx ;" |
| 1431 | + /******************************************/ |
| 1432 | + "adcq $0, %%rcx ;" |
| 1433 | + |
| 1434 | + "movq 8(%1), %%rdx; " /* A[1] */ |
| 1435 | + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ |
| 1436 | + "addq %%r15, %%r8 ;" |
| 1437 | + "movq %%r8, 8(%0) ;" |
| 1438 | + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ |
| 1439 | + "adcq %%r10, %%r9 ;" |
| 1440 | + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ |
| 1441 | + "adcq %%r8, %%r11 ;" |
| 1442 | + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ |
| 1443 | + "adcq %%r10, %%r13 ;" |
| 1444 | + /******************************************/ |
| 1445 | + "adcq $0, %%r15 ;" |
| 1446 | + |
| 1447 | + "addq %%r9, %%rax ;" /* merge this row into the running accumulator */ |
| 1448 | + "adcq %%r11, %%rbx ;" |
| 1449 | + "adcq %%r13, %%rcx ;" |
| 1450 | + "adcq $0, %%r15 ;" |
| 1451 | + |
| 1452 | + "movq 16(%1), %%rdx; " /* A[2] */ |
| 1453 | + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ |
| 1454 | + "addq %%rax, %%r8 ;" |
| 1455 | + "movq %%r8, 16(%0) ;" |
| 1456 | + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ |
| 1457 | + "adcq %%r10, %%r9 ;" |
| 1458 | + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ |
| 1459 | + "adcq %%r8, %%r11 ;" |
| 1460 | + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ |
| 1461 | + "adcq %%r10, %%r13 ;" |
| 1462 | + /******************************************/ |
| 1463 | + "adcq $0, %%rax ;" |
| 1464 | + |
| 1465 | + "addq %%r9, %%rbx ;" |
| 1466 | + "adcq %%r11, %%rcx ;" |
| 1467 | + "adcq %%r13, %%r15 ;" |
| 1468 | + "adcq $0, %%rax ;" |
| 1469 | + |
| 1470 | + "movq 24(%1), %%rdx; " /* A[3] */ |
| 1471 | + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ |
| 1472 | + "addq %%rbx, %%r8 ;" |
| 1473 | + "movq %%r8, 24(%0) ;" |
| 1474 | + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ |
| 1475 | + "adcq %%r10, %%r9 ;" |
| 1476 | + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ |
| 1477 | + "adcq %%r8, %%r11 ;" |
| 1478 | + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ |
| 1479 | + "adcq %%r10, %%r13 ;" |
| 1480 | + /******************************************/ |
| 1481 | + "adcq $0, %%rbx ;" |
| 1482 | + |
| 1483 | + "addq %%r9, %%rcx ;" |
| 1484 | + "movq %%rcx, 32(%0) ;" |
| 1485 | + "adcq %%r11, %%r15 ;" |
| 1486 | + "movq %%r15, 40(%0) ;" |
| 1487 | + "adcq %%r13, %%rax ;" |
| 1488 | + "movq %%rax, 48(%0) ;" |
| 1489 | + "adcq $0, %%rbx ;" |
| 1490 | + "movq %%rbx, 56(%0) ;" |
| 1491 | + : |
| 1492 | + : "r"(c), "r"(a), "r"(b) |
| 1493 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 1494 | + "%r10", "%r11", "%r13", "%r15"); |
| 1495 | +} |
| 1496 | +/* c[0..7] = a[0..3]^2: the six cross products A[i]*A[j] (i != j) are computed once and doubled, then the squares A[i]^2 are added; c is stored while a is still being read, so c must not overlap a. */ |
| 1497 | +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) |
| 1498 | +{ |
| 1499 | + asm volatile( |
| 1500 | + "movq (%1), %%rdx ;" /* A[0] */ |
| 1501 | + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ |
| 1502 | + "xorl %%r15d, %%r15d;" /* r15 = 0, CF and OF cleared */ |
| 1503 | + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ |
| 1504 | + "adcx %%r14, %%r9 ;" |
| 1505 | + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ |
| 1506 | + "adcx %%rax, %%r10 ;" |
| 1507 | + "movq 24(%1), %%rdx ;" /* A[3] */ |
| 1508 | + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ |
| 1509 | + "adcx %%rcx, %%r11 ;" |
| 1510 | + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ |
| 1511 | + "adcx %%rax, %%rbx ;" |
| 1512 | + "movq 8(%1), %%rdx ;" /* A[1] */ |
| 1513 | + "adcx %%r15, %%r13 ;" |
| 1514 | + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ |
| 1515 | + "movq $0, %%r14 ;" |
| 1516 | + /******************************************/ |
| 1517 | + "adcx %%r15, %%r14 ;" |
| 1518 | + |
| 1519 | + "xorl %%r15d, %%r15d;" /* restart both carry chains */ |
| 1520 | + "adox %%rax, %%r10 ;" /* OF chain: add A[2]*A[1] */ |
| 1521 | + "adcx %%r8, %%r8 ;" /* CF chain: double the cross-product sum */ |
| 1522 | + "adox %%rcx, %%r11 ;" |
| 1523 | + "adcx %%r9, %%r9 ;" |
| 1524 | + "adox %%r15, %%rbx ;" |
| 1525 | + "adcx %%r10, %%r10 ;" |
| 1526 | + "adox %%r15, %%r13 ;" |
| 1527 | + "adcx %%r11, %%r11 ;" |
| 1528 | + "adox %%r15, %%r14 ;" |
| 1529 | + "adcx %%rbx, %%rbx ;" |
| 1530 | + "adcx %%r13, %%r13 ;" |
| 1531 | + "adcx %%r14, %%r14 ;" |
| 1532 | + |
| 1533 | + "movq (%1), %%rdx ;" |
| 1534 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ |
| 1535 | + /*******************/ |
| 1536 | + "movq %%rax, 0(%0) ;" |
| 1537 | + "addq %%rcx, %%r8 ;" |
| 1538 | + "movq %%r8, 8(%0) ;" |
| 1539 | + "movq 8(%1), %%rdx ;" |
| 1540 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ |
| 1541 | + "adcq %%rax, %%r9 ;" |
| 1542 | + "movq %%r9, 16(%0) ;" |
| 1543 | + "adcq %%rcx, %%r10 ;" |
| 1544 | + "movq %%r10, 24(%0) ;" |
| 1545 | + "movq 16(%1), %%rdx ;" |
| 1546 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ |
| 1547 | + "adcq %%rax, %%r11 ;" |
| 1548 | + "movq %%r11, 32(%0) ;" |
| 1549 | + "adcq %%rcx, %%rbx ;" |
| 1550 | + "movq %%rbx, 40(%0) ;" |
| 1551 | + "movq 24(%1), %%rdx ;" |
| 1552 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ |
| 1553 | + "adcq %%rax, %%r13 ;" |
| 1554 | + "movq %%r13, 48(%0) ;" |
| 1555 | + "adcq %%rcx, %%r14 ;" |
| 1556 | + "movq %%r14, 56(%0) ;" |
| 1557 | + : |
| 1558 | + : "r"(c), "r"(a) |
| 1559 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 1560 | + "%r10", "%r11", "%r13", "%r14", "%r15"); |
| 1561 | +} |
| 1562 | +/* c[0..7] = a[0..3]^2: the cross products are summed once, doubled with a shld/shl chain, then the squares A[i]^2 are added; c is stored while a is still being read, so c must not overlap a. */ |
| 1563 | +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) |
| 1564 | +{ |
| 1565 | + asm volatile( |
| 1566 | + "movq 8(%1), %%rdx ;" /* A[1] */ |
| 1567 | + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ |
| 1568 | + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ |
| 1569 | + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ |
| 1570 | + |
| 1571 | + "movq 16(%1), %%rdx ;" /* A[2] */ |
| 1572 | + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ |
| 1573 | + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ |
| 1574 | + |
| 1575 | + "addq %%rax, %%r9 ;" |
| 1576 | + "adcq %%rdx, %%r10 ;" |
| 1577 | + "adcq %%rcx, %%r11 ;" |
| 1578 | + "adcq %%r14, %%r15 ;" |
| 1579 | + "adcq $0, %%r13 ;" |
| 1580 | + "movq $0, %%r14 ;" /* mov keeps CF for the adcq below */ |
| 1581 | + "adcq $0, %%r14 ;" |
| 1582 | + |
| 1583 | + "movq (%1), %%rdx ;" /* A[0] */ |
| 1584 | + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ |
| 1585 | + |
| 1586 | + "addq %%rax, %%r10 ;" |
| 1587 | + "adcq %%rcx, %%r11 ;" |
| 1588 | + "adcq $0, %%r15 ;" |
| 1589 | + "adcq $0, %%r13 ;" |
| 1590 | + "adcq $0, %%r14 ;" |
| 1591 | + |
| 1592 | + "shldq $1, %%r13, %%r14 ;" /* double r14:r13:r15:r11:r10:r9:r8, top word first */ |
| 1593 | + "shldq $1, %%r15, %%r13 ;" |
| 1594 | + "shldq $1, %%r11, %%r15 ;" |
| 1595 | + "shldq $1, %%r10, %%r11 ;" |
| 1596 | + "shldq $1, %%r9, %%r10 ;" |
| 1597 | + "shldq $1, %%r8, %%r9 ;" |
| 1598 | + "shlq $1, %%r8 ;" |
| 1599 | + |
| 1600 | + /*******************/ |
| 1601 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2; rdx still holds A[0] from the load above */ |
| 1602 | + /*******************/ |
| 1603 | + "movq %%rax, 0(%0) ;" |
| 1604 | + "addq %%rcx, %%r8 ;" |
| 1605 | + "movq %%r8, 8(%0) ;" |
| 1606 | + "movq 8(%1), %%rdx ;" |
| 1607 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ |
| 1608 | + "adcq %%rax, %%r9 ;" |
| 1609 | + "movq %%r9, 16(%0) ;" |
| 1610 | + "adcq %%rcx, %%r10 ;" |
| 1611 | + "movq %%r10, 24(%0) ;" |
| 1612 | + "movq 16(%1), %%rdx ;" |
| 1613 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ |
| 1614 | + "adcq %%rax, %%r11 ;" |
| 1615 | + "movq %%r11, 32(%0) ;" |
| 1616 | + "adcq %%rcx, %%r15 ;" |
| 1617 | + "movq %%r15, 40(%0) ;" |
| 1618 | + "movq 24(%1), %%rdx ;" |
| 1619 | + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ |
| 1620 | + "adcq %%rax, %%r13 ;" |
| 1621 | + "movq %%r13, 48(%0) ;" |
| 1622 | + "adcq %%rcx, %%r14 ;" |
| 1623 | + "movq %%r14, 56(%0) ;" |
| 1624 | + : |
| 1625 | + : "r"(c), "r"(a) |
| 1626 | + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", |
| 1627 | + "%r11", "%r13", "%r14", "%r15"); |
| 1628 | +} |
| 1629 | +/* c[0..3] = a[0..7] folded to four words: a[4..7] is multiplied by 38 and added to a[0..3], and the overflow is folded back in as multiples of 38. All of a is loaded before the first store to c. */ |
| 1630 | +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) |
| 1631 | +{ |
| 1632 | + asm volatile( |
| 1633 | + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ |
| 1634 | + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ |
| 1635 | + "xorl %%ebx, %%ebx ;" /* rbx = 0, CF and OF cleared */ |
| 1636 | + "adox (%1), %%r8 ;" |
| 1637 | + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ |
| 1638 | + "adcx %%r10, %%r9 ;" |
| 1639 | + "adox 8(%1), %%r9 ;" |
| 1640 | + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ |
| 1641 | + "adcx %%r11, %%r10 ;" |
| 1642 | + "adox 16(%1), %%r10 ;" |
| 1643 | + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ |
| 1644 | + "adcx %%rax, %%r11 ;" |
| 1645 | + "adox 24(%1), %%r11 ;" |
| 1646 | + /***************************************/ |
| 1647 | + "adcx %%rbx, %%rcx ;" |
| 1648 | + "adox %%rbx, %%rcx ;" |
| 1649 | + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ |
| 1650 | + "adcx %%rcx, %%r8 ;" |
| 1651 | + "adcx %%rbx, %%r9 ;" |
| 1652 | + "movq %%r9, 8(%0) ;" |
| 1653 | + "adcx %%rbx, %%r10 ;" |
| 1654 | + "movq %%r10, 16(%0) ;" |
| 1655 | + "adcx %%rbx, %%r11 ;" |
| 1656 | + "movq %%r11, 24(%0) ;" |
| 1657 | + "mov $0, %%ecx ;" /* mov keeps CF for the cmovc below */ |
| 1658 | + "cmovc %%edx, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1659 | + "addq %%rcx, %%r8 ;" |
| 1660 | + "movq %%r8, (%0) ;" |
| 1661 | + : |
| 1662 | + : "r"(c), "r"(a) |
| 1663 | + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", |
| 1664 | + "%r10", "%r11"); |
| 1665 | +} |
| 1666 | +/* c[0..3] = a[0..7] folded to four words: a[4..7] is multiplied by 38 and added to a[0..3], and the overflow is folded back in as multiples of 38. All of a is loaded before the first store to c. */ |
| 1667 | +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) |
| 1668 | +{ |
| 1669 | + asm volatile( |
| 1670 | + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ |
| 1671 | + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ |
| 1672 | + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ |
| 1673 | + "addq %%r10, %%r9 ;" |
| 1674 | + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ |
| 1675 | + "adcq %%r11, %%r10 ;" |
| 1676 | + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ |
| 1677 | + "adcq %%rax, %%r11 ;" |
| 1678 | + /***************************************/ |
| 1679 | + "adcq $0, %%rcx ;" |
| 1680 | + "addq (%1), %%r8 ;" /* add the low half a[0..3] */ |
| 1681 | + "adcq 8(%1), %%r9 ;" |
| 1682 | + "adcq 16(%1), %%r10 ;" |
| 1683 | + "adcq 24(%1), %%r11 ;" |
| 1684 | + "adcq $0, %%rcx ;" |
| 1685 | + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ |
| 1686 | + "addq %%rcx, %%r8 ;" |
| 1687 | + "adcq $0, %%r9 ;" |
| 1688 | + "movq %%r9, 8(%0) ;" |
| 1689 | + "adcq $0, %%r10 ;" |
| 1690 | + "movq %%r10, 16(%0) ;" |
| 1691 | + "adcq $0, %%r11 ;" |
| 1692 | + "movq %%r11, 24(%0) ;" |
| 1693 | + "mov $0, %%ecx ;" /* mov keeps CF for the cmovc below */ |
| 1694 | + "cmovc %%edx, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1695 | + "addq %%rcx, %%r8 ;" |
| 1696 | + "movq %%r8, (%0) ;" |
| 1697 | + : |
| 1698 | + : "r"(c), "r"(a) |
| 1699 | + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", |
| 1700 | + "%r11"); |
| 1701 | +} |
| 1702 | +/* c[0..3] = a[0..3] + b[0..3]; a carry out of the top word is folded back in as +38, twice. Both inputs are fully loaded before the first store to c. */ |
| 1703 | +static __always_inline void |
| 1704 | +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) |
| 1705 | +{ |
| 1706 | + asm volatile( |
| 1707 | + "mov $38, %%eax ;" |
| 1708 | + "xorl %%ecx, %%ecx ;" /* rcx = 0, CF cleared for the adcx chain */ |
| 1709 | + "movq (%2), %%r8 ;" |
| 1710 | + "adcx (%1), %%r8 ;" |
| 1711 | + "movq 8(%2), %%r9 ;" |
| 1712 | + "adcx 8(%1), %%r9 ;" |
| 1713 | + "movq 16(%2), %%r10 ;" |
| 1714 | + "adcx 16(%1), %%r10 ;" |
| 1715 | + "movq 24(%2), %%r11 ;" |
| 1716 | + "adcx 24(%1), %%r11 ;" |
| 1717 | + "cmovc %%eax, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1718 | + "xorl %%eax, %%eax ;" /* rax = 0, CF cleared again */ |
| 1719 | + "adcx %%rcx, %%r8 ;" |
| 1720 | + "adcx %%rax, %%r9 ;" |
| 1721 | + "movq %%r9, 8(%0) ;" |
| 1722 | + "adcx %%rax, %%r10 ;" |
| 1723 | + "movq %%r10, 16(%0) ;" |
| 1724 | + "adcx %%rax, %%r11 ;" |
| 1725 | + "movq %%r11, 24(%0) ;" |
| 1726 | + "mov $38, %%ecx ;" |
| 1727 | + "cmovc %%ecx, %%eax ;" /* rax = carry ? 38 : 0 */ |
| 1728 | + "addq %%rax, %%r8 ;" |
| 1729 | + "movq %%r8, (%0) ;" |
| 1730 | + : |
| 1731 | + : "r"(c), "r"(a), "r"(b) |
| 1732 | + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); |
| 1733 | +} |
| 1734 | +/* c[0..3] = a[0..3] + b[0..3]; a carry out of the top word is folded back in as +38, twice. Both inputs are fully loaded before the first store to c. */ |
| 1735 | +static __always_inline void |
| 1736 | +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) |
| 1737 | +{ |
| 1738 | + asm volatile( |
| 1739 | + "mov $38, %%eax ;" |
| 1740 | + "movq (%2), %%r8 ;" |
| 1741 | + "addq (%1), %%r8 ;" |
| 1742 | + "movq 8(%2), %%r9 ;" |
| 1743 | + "adcq 8(%1), %%r9 ;" |
| 1744 | + "movq 16(%2), %%r10 ;" |
| 1745 | + "adcq 16(%1), %%r10 ;" |
| 1746 | + "movq 24(%2), %%r11 ;" |
| 1747 | + "adcq 24(%1), %%r11 ;" |
| 1748 | + "mov $0, %%ecx ;" /* mov keeps CF for the cmovc below */ |
| 1749 | + "cmovc %%eax, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1750 | + "addq %%rcx, %%r8 ;" |
| 1751 | + "adcq $0, %%r9 ;" |
| 1752 | + "movq %%r9, 8(%0) ;" |
| 1753 | + "adcq $0, %%r10 ;" |
| 1754 | + "movq %%r10, 16(%0) ;" |
| 1755 | + "adcq $0, %%r11 ;" |
| 1756 | + "movq %%r11, 24(%0) ;" |
| 1757 | + "mov $0, %%ecx ;" |
| 1758 | + "cmovc %%eax, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1759 | + "addq %%rcx, %%r8 ;" |
| 1760 | + "movq %%r8, (%0) ;" |
| 1761 | + : |
| 1762 | + : "r"(c), "r"(a), "r"(b) |
| 1763 | + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); |
| 1764 | +} |
| 1765 | +/* c[0..3] = a[0..3] - b[0..3]; a borrow out of the top word is folded back in as -38, twice. Both inputs are fully loaded before the first store to c. */ |
| 1766 | +static __always_inline void |
| 1767 | +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) |
| 1768 | +{ |
| 1769 | + asm volatile( |
| 1770 | + "mov $38, %%eax ;" |
| 1771 | + "movq (%1), %%r8 ;" |
| 1772 | + "subq (%2), %%r8 ;" |
| 1773 | + "movq 8(%1), %%r9 ;" |
| 1774 | + "sbbq 8(%2), %%r9 ;" |
| 1775 | + "movq 16(%1), %%r10 ;" |
| 1776 | + "sbbq 16(%2), %%r10 ;" |
| 1777 | + "movq 24(%1), %%r11 ;" |
| 1778 | + "sbbq 24(%2), %%r11 ;" |
| 1779 | + "mov $0, %%ecx ;" /* mov keeps CF for the cmovc below */ |
| 1780 | + "cmovc %%eax, %%ecx ;" /* rcx = borrow ? 38 : 0 */ |
| 1781 | + "subq %%rcx, %%r8 ;" |
| 1782 | + "sbbq $0, %%r9 ;" |
| 1783 | + "movq %%r9, 8(%0) ;" |
| 1784 | + "sbbq $0, %%r10 ;" |
| 1785 | + "movq %%r10, 16(%0) ;" |
| 1786 | + "sbbq $0, %%r11 ;" |
| 1787 | + "movq %%r11, 24(%0) ;" |
| 1788 | + "mov $0, %%ecx ;" |
| 1789 | + "cmovc %%eax, %%ecx ;" /* rcx = borrow ? 38 : 0 */ |
| 1790 | + "subq %%rcx, %%r8 ;" |
| 1791 | + "movq %%r8, (%0) ;" |
| 1792 | + : |
| 1793 | + : "r"(c), "r"(a), "r"(b) |
| 1794 | + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); |
| 1795 | +} |
| 1796 | + |
| 1797 | +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666: c[0..3] = a[0..3] * a24, with the overflow word folded back in as a multiple of 38 */ |
| 1798 | +static __always_inline void |
| 1799 | +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) |
| 1800 | +{ |
| 1801 | + const u64 a24 = 121666; |
| 1802 | + asm volatile( |
| 1803 | + "movq %2, %%rdx ;" /* rdx = a24, the implicit mulx multiplicand */ |
| 1804 | + "mulx (%1), %%r8, %%r10 ;" |
| 1805 | + "mulx 8(%1), %%r9, %%r11 ;" |
| 1806 | + "addq %%r10, %%r9 ;" |
| 1807 | + "mulx 16(%1), %%r10, %%rax ;" |
| 1808 | + "adcq %%r11, %%r10 ;" |
| 1809 | + "mulx 24(%1), %%r11, %%rcx ;" |
| 1810 | + "adcq %%rax, %%r11 ;" |
| 1811 | + /**************************/ |
| 1812 | + "adcq $0, %%rcx ;" /* rcx = fifth (overflow) word of the product */ |
| 1813 | + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ |
| 1814 | + "imul %%rdx, %%rcx ;" |
| 1815 | + "addq %%rcx, %%r8 ;" |
| 1816 | + "adcq $0, %%r9 ;" |
| 1817 | + "movq %%r9, 8(%0) ;" |
| 1818 | + "adcq $0, %%r10 ;" |
| 1819 | + "movq %%r10, 16(%0) ;" |
| 1820 | + "adcq $0, %%r11 ;" |
| 1821 | + "movq %%r11, 24(%0) ;" |
| 1822 | + "mov $0, %%ecx ;" /* mov keeps CF for the cmovc below */ |
| 1823 | + "cmovc %%edx, %%ecx ;" /* rcx = carry ? 38 : 0 */ |
| 1824 | + "addq %%rcx, %%r8 ;" |
| 1825 | + "movq %%r8, (%0) ;" |
| 1826 | + : |
| 1827 | + : "r"(c), "r"(a), "r"(a24) |
| 1828 | + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", |
| 1829 | + "%r11"); |
| 1830 | +} |
| 1831 | +/* c = a^(-1), computed with a fixed sequence of repeated squarings and multiplications (ADX helpers); the stack temporaries are wiped before returning. */ |
| 1832 | +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) |
| 1833 | +{ |
| 1834 | + struct { |
| 1835 | + eltfp25519_1w_buffer buffer; |
| 1836 | + eltfp25519_1w x0, x1, x2; |
| 1837 | + } __aligned(32) m; |
| 1838 | + u64 *T[4]; |
| 1839 | + |
| 1840 | + T[0] = m.x0; |
| 1841 | + T[1] = c; /* x^(-1) */ |
| 1842 | + T[2] = m.x1; |
| 1843 | + T[3] = m.x2; |
| 1844 | + |
| 1845 | + copy_eltfp25519_1w(T[1], a); |
| 1846 | + sqrn_eltfp25519_1w_adx(T[1], 1); |
| 1847 | + copy_eltfp25519_1w(T[2], T[1]); |
| 1848 | + sqrn_eltfp25519_1w_adx(T[2], 2); |
| 1849 | + mul_eltfp25519_1w_adx(T[0], a, T[2]); /* a is read again after c (T[1]) was overwritten: c must not alias a */ |
| 1850 | + mul_eltfp25519_1w_adx(T[1], T[1], T[0]); |
| 1851 | + copy_eltfp25519_1w(T[2], T[1]); |
| 1852 | + sqrn_eltfp25519_1w_adx(T[2], 1); |
| 1853 | + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); |
| 1854 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1855 | + sqrn_eltfp25519_1w_adx(T[2], 5); |
| 1856 | + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); |
| 1857 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1858 | + sqrn_eltfp25519_1w_adx(T[2], 10); |
| 1859 | + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); |
| 1860 | + copy_eltfp25519_1w(T[3], T[2]); |
| 1861 | + sqrn_eltfp25519_1w_adx(T[3], 20); |
| 1862 | + mul_eltfp25519_1w_adx(T[3], T[3], T[2]); |
| 1863 | + sqrn_eltfp25519_1w_adx(T[3], 10); |
| 1864 | + mul_eltfp25519_1w_adx(T[3], T[3], T[0]); |
| 1865 | + copy_eltfp25519_1w(T[0], T[3]); |
| 1866 | + sqrn_eltfp25519_1w_adx(T[0], 50); |
| 1867 | + mul_eltfp25519_1w_adx(T[0], T[0], T[3]); |
| 1868 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1869 | + sqrn_eltfp25519_1w_adx(T[2], 100); |
| 1870 | + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); |
| 1871 | + sqrn_eltfp25519_1w_adx(T[2], 50); |
| 1872 | + mul_eltfp25519_1w_adx(T[2], T[2], T[3]); |
| 1873 | + sqrn_eltfp25519_1w_adx(T[2], 5); |
| 1874 | + mul_eltfp25519_1w_adx(T[1], T[1], T[2]); /* final product lands in c */ |
| 1875 | + |
| 1876 | + memzero_explicit(&m, sizeof(m)); /* wipe the intermediates held in m */ |
| 1877 | +} |
| 1878 | +/* c = a^(-1), computed with a fixed sequence of repeated squarings and multiplications (BMI2 helpers); the stack temporaries are wiped before returning. */ |
| 1879 | +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) |
| 1880 | +{ |
| 1881 | + struct { |
| 1882 | + eltfp25519_1w_buffer buffer; |
| 1883 | + eltfp25519_1w x0, x1, x2; |
| 1884 | + } __aligned(32) m; |
| 1885 | + u64 *T[4]; /* only T[0..3] are assigned and used, same as the ADX variant */ |
| 1886 | + |
| 1887 | + T[0] = m.x0; |
| 1888 | + T[1] = c; /* x^(-1) */ |
| 1889 | + T[2] = m.x1; |
| 1890 | + T[3] = m.x2; |
| 1891 | + |
| 1892 | + copy_eltfp25519_1w(T[1], a); |
| 1893 | + sqrn_eltfp25519_1w_bmi2(T[1], 1); |
| 1894 | + copy_eltfp25519_1w(T[2], T[1]); |
| 1895 | + sqrn_eltfp25519_1w_bmi2(T[2], 2); |
| 1896 | + mul_eltfp25519_1w_bmi2(T[0], a, T[2]); /* a is read again after c (T[1]) was overwritten: c must not alias a */ |
| 1897 | + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); |
| 1898 | + copy_eltfp25519_1w(T[2], T[1]); |
| 1899 | + sqrn_eltfp25519_1w_bmi2(T[2], 1); |
| 1900 | + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); |
| 1901 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1902 | + sqrn_eltfp25519_1w_bmi2(T[2], 5); |
| 1903 | + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); |
| 1904 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1905 | + sqrn_eltfp25519_1w_bmi2(T[2], 10); |
| 1906 | + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); |
| 1907 | + copy_eltfp25519_1w(T[3], T[2]); |
| 1908 | + sqrn_eltfp25519_1w_bmi2(T[3], 20); |
| 1909 | + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); |
| 1910 | + sqrn_eltfp25519_1w_bmi2(T[3], 10); |
| 1911 | + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); |
| 1912 | + copy_eltfp25519_1w(T[0], T[3]); |
| 1913 | + sqrn_eltfp25519_1w_bmi2(T[0], 50); |
| 1914 | + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); |
| 1915 | + copy_eltfp25519_1w(T[2], T[0]); |
| 1916 | + sqrn_eltfp25519_1w_bmi2(T[2], 100); |
| 1917 | + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); |
| 1918 | + sqrn_eltfp25519_1w_bmi2(T[2], 50); |
| 1919 | + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); |
| 1920 | + sqrn_eltfp25519_1w_bmi2(T[2], 5); |
| 1921 | + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); /* final product lands in c */ |
| 1922 | + |
| 1923 | + memzero_explicit(&m, sizeof(m)); /* wipe the intermediates held in m */ |
| 1924 | +} |
| 1925 | + |
| 1926 | +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c |
| 1927 | + * with a number such that 0 <= C < 2**255-19. |
| 1928 | + */ |
| 1929 | +static __always_inline void fred_eltfp25519_1w(u64 *const c) |
| 1930 | +{ |
| 1931 | + u64 tmp0 = 38, tmp1 = 19; /* operands %4 and %5 below; c[0..3] are %0..%3 */ |
| 1932 | + asm volatile( |
| 1933 | + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ |
| 1934 | + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ |
| 1935 | + |
| 1936 | + /* Add either 19 or 38 to c */ |
| 1937 | + "addq %4, %0 ;" |
| 1938 | + "adcq $0, %1 ;" |
| 1939 | + "adcq $0, %2 ;" |
| 1940 | + "adcq $0, %3 ;" |
| 1941 | + |
| 1942 | + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ |
| 1943 | + "movl $0, %k4 ;" /* mov leaves the sign flag set by the adcq on %3 intact */ |
| 1944 | + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ |
| 1945 | + "btrq $63, %3 ;" /* Clear bit 255 */ |
| 1946 | + |
| 1947 | + /* Subtract 19 if necessary */ |
| 1948 | + "subq %4, %0 ;" |
| 1949 | + "sbbq $0, %1 ;" |
| 1950 | + "sbbq $0, %2 ;" |
| 1951 | + "sbbq $0, %3 ;" |
| 1952 | + |
| 1953 | + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), |
| 1954 | + "+r"(tmp1) |
| 1955 | + : |
| 1956 | + : "memory", "cc"); |
| 1957 | +} |
| 1958 | +/* Exchange px[0..3] with py[0..3] when bit is non-zero, leave both untouched otherwise; done with conditional moves instead of a branch. */ |
| 1959 | +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) |
| 1960 | +{ |
| 1961 | + u64 temp; |
| 1962 | + asm volatile( |
| 1963 | + "test %9, %9 ;" /* ZF = (bit == 0); nothing below changes the flags */ |
| 1964 | + "movq %0, %8 ;" /* temp = px[0] */ |
| 1965 | + "cmovnzq %4, %0 ;" /* px[0] = py[0] if bit */ |
| 1966 | + "cmovnzq %8, %4 ;" /* py[0] = old px[0] if bit */ |
| 1967 | + "movq %1, %8 ;" |
| 1968 | + "cmovnzq %5, %1 ;" |
| 1969 | + "cmovnzq %8, %5 ;" |
| 1970 | + "movq %2, %8 ;" |
| 1971 | + "cmovnzq %6, %2 ;" |
| 1972 | + "cmovnzq %8, %6 ;" |
| 1973 | + "movq %3, %8 ;" |
| 1974 | + "cmovnzq %7, %3 ;" |
| 1975 | + "cmovnzq %8, %7 ;" |
| 1976 | + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), |
| 1977 | + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), |
| 1978 | + "=r"(temp) |
| 1979 | + : "r"(bit) |
| 1980 | + : "cc" |
| 1981 | + ); |
| 1982 | +} |
| 1983 | +/* Overwrite px[0..3] with py[0..3] when bit is non-zero, leave px untouched otherwise; done with conditional moves instead of a branch. */ |
| 1984 | +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) |
| 1985 | +{ |
| 1986 | + asm volatile( |
| 1987 | + "test %4, %4 ;" /* ZF = (bit == 0) */ |
| 1988 | + "cmovnzq %5, %0 ;" /* px[0] = py[0] if bit */ |
| 1989 | + "cmovnzq %6, %1 ;" |
| 1990 | + "cmovnzq %7, %2 ;" |
| 1991 | + "cmovnzq %8, %3 ;" |
| 1992 | + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) |
| 1993 | + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) |
| 1994 | + : "cc" |
| 1995 | + ); |
| 1996 | +} |
| 1997 | +/* Scalar multiplication (ADX helpers): multiplies the point whose u-coordinate is session_key by the clamped private_key and writes the resulting u-coordinate to shared. Inputs are copied to the stack first and every intermediate is wiped on exit. */ |
| 1998 | +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], |
| 1999 | + const u8 private_key[CURVE25519_KEY_SIZE], |
| 2000 | + const u8 session_key[CURVE25519_KEY_SIZE]) |
| 2001 | +{ |
| 2002 | + struct { |
| 2003 | + u64 buffer[4 * NUM_WORDS_ELTFP25519]; |
| 2004 | + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; |
| 2005 | + u64 workspace[6 * NUM_WORDS_ELTFP25519]; |
| 2006 | + u8 session[CURVE25519_KEY_SIZE]; |
| 2007 | + u8 private[CURVE25519_KEY_SIZE]; |
| 2008 | + } __aligned(32) m; |
| 2009 | + |
| 2010 | + int i = 0, j = 0; |
| 2011 | + u64 prev = 0; |
| 2012 | + u64 *const X1 = (u64 *)m.session; |
| 2013 | + u64 *const key = (u64 *)m.private; |
| 2014 | + u64 *const Px = m.coordinates + 0; |
| 2015 | + u64 *const Pz = m.coordinates + 4; |
| 2016 | + u64 *const Qx = m.coordinates + 8; |
| 2017 | + u64 *const Qz = m.coordinates + 12; |
| 2018 | + u64 *const X2 = Qx; |
| 2019 | + u64 *const Z2 = Qz; |
| 2020 | + u64 *const X3 = Px; |
| 2021 | + u64 *const Z3 = Pz; |
| 2022 | + u64 *const X2Z2 = Qx; /* two-word view: X2 followed by Z2 */ |
| 2023 | + u64 *const X3Z3 = Px; /* two-word view: X3 followed by Z3 */ |
| 2024 | + |
| 2025 | + u64 *const A = m.workspace + 0; |
| 2026 | + u64 *const B = m.workspace + 4; |
| 2027 | + u64 *const D = m.workspace + 8; |
| 2028 | + u64 *const C = m.workspace + 12; |
| 2029 | + u64 *const DA = m.workspace + 16; |
| 2030 | + u64 *const CB = m.workspace + 20; |
| 2031 | + u64 *const AB = A; /* two-word view: A followed by B */ |
| 2032 | + u64 *const DC = D; /* two-word view: D followed by C */ |
| 2033 | + u64 *const DACB = DA; /* two-word view: DA followed by CB */ |
| 2034 | + |
| 2035 | + memcpy(m.private, private_key, sizeof(m.private)); |
| 2036 | + memcpy(m.session, session_key, sizeof(m.session)); |
| 2037 | + |
| 2038 | + curve25519_clamp_secret(m.private); |
| 2039 | + |
| 2040 | + /* As in the draft: |
| 2041 | + * When receiving such an array, implementations of curve25519 |
| 2042 | + * MUST mask the most-significant bit in the final byte. This |
| 2043 | + * is done to preserve compatibility with point formats which |
| 2044 | + * reserve the sign bit for use in other protocols and to |
| 2045 | + * increase resistance to implementation fingerprinting |
| 2046 | + */ |
| 2047 | + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; |
| 2048 | + |
| 2049 | + copy_eltfp25519_1w(Px, X1); |
| 2050 | + setzero_eltfp25519_1w(Pz); |
| 2051 | + setzero_eltfp25519_1w(Qx); |
| 2052 | + setzero_eltfp25519_1w(Qz); |
| 2053 | + |
| 2054 | + Pz[0] = 1; |
| 2055 | + Qx[0] = 1; |
| 2056 | + |
| 2057 | + /* main-loop */ |
| 2058 | + prev = 0; |
| 2059 | + j = 62; /* the walk starts at bit 62 of key[3]; bit 63 of that word is never examined */ |
| 2060 | + for (i = 3; i >= 0; --i) { |
| 2061 | + while (j >= 0) { |
| 2062 | + u64 bit = (key[i] >> j) & 0x1; |
| 2063 | + u64 swap = bit ^ prev; |
| 2064 | + prev = bit; |
| 2065 | + |
| 2066 | + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ |
| 2067 | + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ |
| 2068 | + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ |
| 2069 | + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ |
| 2070 | + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ |
| 2071 | + |
| 2072 | + cselect(swap, A, C); /* A = swap ? C : A */ |
| 2073 | + cselect(swap, B, D); /* B = swap ? D : B */ |
| 2074 | + |
| 2075 | + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ |
| 2076 | + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ |
| 2077 | + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ |
| 2078 | + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */ |
| 2079 | + |
| 2080 | + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ |
| 2081 | + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ |
| 2082 | + |
| 2083 | + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ |
| 2084 | + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ |
| 2085 | + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ |
| 2086 | + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ |
| 2087 | + --j; |
| 2088 | + } |
| 2089 | + j = 63; /* the remaining key words are walked from their top bit */ |
| 2090 | + } |
| 2091 | + |
| 2092 | + inv_eltfp25519_1w_adx(A, Qz); /* A = 1/Qz */ |
| 2093 | + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); /* shared = Qx/Qz */ |
| 2094 | + fred_eltfp25519_1w((u64 *)shared); /* bring shared into 0 <= shared < 2^255-19 */ |
| 2095 | + |
| 2096 | + memzero_explicit(&m, sizeof(m)); /* wipe the key copies and all intermediates */ |
| 2097 | +} |
| 2098 | +/* Fixed-base variant (ADX helpers): derives session_key from the clamped private_key using the precomputed table table_ladder_8k, followed by q doublings and a conversion to affine form. The key copy and every intermediate are wiped on exit. */ |
| 2099 | +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], |
| 2100 | + const u8 private_key[CURVE25519_KEY_SIZE]) |
| 2101 | +{ |
| 2102 | + struct { |
| 2103 | + u64 buffer[4 * NUM_WORDS_ELTFP25519]; |
| 2104 | + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; |
| 2105 | + u64 workspace[4 * NUM_WORDS_ELTFP25519]; |
| 2106 | + u8 private[CURVE25519_KEY_SIZE]; |
| 2107 | + } __aligned(32) m; |
| 2108 | + |
| 2109 | + const int ite[4] = { 64, 64, 64, 63 }; /* exclusive upper bit index per key word; bit 63 of key[3] is skipped */ |
| 2110 | + const int q = 3; /* the q low key bits are skipped by the main loop; q doublings follow it */ |
| 2111 | + u64 swap = 1; |
| 2112 | + |
| 2113 | + int i = 0, j = 0, k = 0; |
| 2114 | + u64 *const key = (u64 *)m.private; |
| 2115 | + u64 *const Ur1 = m.coordinates + 0; |
| 2116 | + u64 *const Zr1 = m.coordinates + 4; |
| 2117 | + u64 *const Ur2 = m.coordinates + 8; |
| 2118 | + u64 *const Zr2 = m.coordinates + 12; |
| 2119 | + |
| 2120 | + u64 *const UZr1 = m.coordinates + 0; /* two-word view: Ur1 followed by Zr1 */ |
| 2121 | + u64 *const ZUr2 = m.coordinates + 8; /* two-word view: Ur2 followed by Zr2 */ |
| 2122 | + |
| 2123 | + u64 *const A = m.workspace + 0; |
| 2124 | + u64 *const B = m.workspace + 4; |
| 2125 | + u64 *const C = m.workspace + 8; |
| 2126 | + u64 *const D = m.workspace + 12; |
| 2127 | + |
| 2128 | + u64 *const AB = m.workspace + 0; /* two-word view: A followed by B */ |
| 2129 | + u64 *const CD = m.workspace + 8; /* two-word view: C followed by D */ |
| 2130 | + |
| 2131 | + const u64 *const P = table_ladder_8k; |
| 2132 | + |
| 2133 | + memcpy(m.private, private_key, sizeof(m.private)); |
| 2134 | + |
| 2135 | + curve25519_clamp_secret(m.private); |
| 2136 | + |
| 2137 | + setzero_eltfp25519_1w(Ur1); |
| 2138 | + setzero_eltfp25519_1w(Zr1); |
| 2139 | + setzero_eltfp25519_1w(Zr2); |
| 2140 | + Ur1[0] = 1; |
| 2141 | + Zr1[0] = 1; |
| 2142 | + Zr2[0] = 1; |
| 2143 | + |
| 2144 | + /* G-S */ |
| 2145 | + Ur2[3] = 0x1eaecdeee27cab34UL; |
| 2146 | + Ur2[2] = 0xadc7a0b9235d48e2UL; |
| 2147 | + Ur2[1] = 0xbbf095ae14b2edf8UL; |
| 2148 | + Ur2[0] = 0x7e94e1fec82faabdUL; |
| 2149 | + |
| 2150 | + /* main-loop */ |
| 2151 | + j = q; |
| 2152 | + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { |
| 2153 | + while (j < ite[i]) { |
| 2154 | + u64 bit = (key[i] >> j) & 0x1; |
| 2155 | + k = (64 * i + j - q); /* table entry index; each entry is 4 words (see &P[4 * k]) */ |
| 2156 | + swap = swap ^ bit; |
| 2157 | + cswap(swap, Ur1, Ur2); |
| 2158 | + cswap(swap, Zr1, Zr2); |
| 2159 | + swap = bit; |
| 2160 | + /* Addition */ |
| 2161 | + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ |
| 2162 | + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ |
| 2163 | + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M*B, with M = P[4*k..4*k+3] */ |
| 2164 | + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ |
| 2165 | + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ |
| 2166 | + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ |
| 2167 | + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ |
| 2168 | + ++j; |
| 2169 | + } |
| 2170 | + j = 0; |
| 2171 | + } |
| 2172 | + |
| 2173 | + /* Doubling */ |
| 2174 | + for (i = 0; i < q; ++i) { |
| 2175 | + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ |
| 2176 | + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ |
| 2177 | + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ |
| 2178 | + copy_eltfp25519_1w(C, B); /* C = B */ |
| 2179 | + sub_eltfp25519_1w(B, A, B); /* B = A-B */ |
| 2180 | + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ |
| 2181 | + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ |
| 2182 | + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*C Zr1 = B*D */ |
| 2183 | + } |
| 2184 | + |
| 2185 | + /* Convert to affine coordinates */ |
| 2186 | + inv_eltfp25519_1w_adx(A, Zr1); /* A = 1/Zr1 */ |
| 2187 | + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); /* session_key = Ur1/Zr1 */ |
| 2188 | + fred_eltfp25519_1w((u64 *)session_key); /* bring the result into 0 <= x < 2^255-19 */ |
| 2189 | + |
| 2190 | + memzero_explicit(&m, sizeof(m)); /* wipe the key copy and all intermediates */ |
| 2191 | +} |
| 2192 | + |
| 2193 | +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], |
| 2194 | + const u8 private_key[CURVE25519_KEY_SIZE], |
| 2195 | + const u8 session_key[CURVE25519_KEY_SIZE]) |
| 2196 | +{ /* Variable-point scalar multiplication: shared = clamped private_key times the point in session_key, using the _bmi2 field routines. */ |
| 2197 | + struct { |
| 2198 | + u64 buffer[4 * NUM_WORDS_ELTFP25519]; |
| 2199 | + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; |
| 2200 | + u64 workspace[6 * NUM_WORDS_ELTFP25519]; |
| 2201 | + u8 session[CURVE25519_KEY_SIZE]; |
| 2202 | + u8 private[CURVE25519_KEY_SIZE]; |
| 2203 | + } __aligned(32) m; |
| 2204 | + /* All secret-dependent state lives in m so the final memzero_explicit() wipes it in one go. */ |
| 2205 | + int i = 0, j = 0; |
| 2206 | + u64 prev = 0; |
| 2207 | + u64 *const X1 = (u64 *)m.session; |
| 2208 | + u64 *const key = (u64 *)m.private; |
| 2209 | + u64 *const Px = m.coordinates + 0; |
| 2210 | + u64 *const Pz = m.coordinates + 4; |
| 2211 | + u64 *const Qx = m.coordinates + 8; |
| 2212 | + u64 *const Qz = m.coordinates + 12; |
| 2213 | + u64 *const X2 = Qx; |
| 2214 | + u64 *const Z2 = Qz; |
| 2215 | + u64 *const X3 = Px; |
| 2216 | + u64 *const Z3 = Pz; |
| 2217 | + u64 *const X2Z2 = Qx; |
| 2218 | + u64 *const X3Z3 = Px; |
| 2219 | + /* Workspace aliases: AB, DC and DACB each name two adjacent elements for the two-way (_2w) routines. */ |
| 2220 | + u64 *const A = m.workspace + 0; |
| 2221 | + u64 *const B = m.workspace + 4; |
| 2222 | + u64 *const D = m.workspace + 8; |
| 2223 | + u64 *const C = m.workspace + 12; |
| 2224 | + u64 *const DA = m.workspace + 16; |
| 2225 | + u64 *const CB = m.workspace + 20; |
| 2226 | + u64 *const AB = A; |
| 2227 | + u64 *const DC = D; |
| 2228 | + u64 *const DACB = DA; |
| 2229 | + |
| 2230 | + memcpy(m.private, private_key, sizeof(m.private)); |
| 2231 | + memcpy(m.session, session_key, sizeof(m.session)); |
| 2232 | + |
| 2233 | + curve25519_clamp_secret(m.private); /* operates on the local copy, not on the caller's key */ |
| 2234 | + |
| 2235 | + /* As in the draft: |
| 2236 | + * When receiving such an array, implementations of curve25519 |
| 2237 | + * MUST mask the most-significant bit in the final byte. This |
| 2238 | + * is done to preserve compatibility with point formats which |
| 2239 | + * reserve the sign bit for use in other protocols and to |
| 2240 | + * increase resistance to implementation fingerprinting |
| 2241 | + */ |
| 2242 | + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; |
| 2243 | + /* Initial state in (X : Z) form: P = (X1 : 1), Q = (1 : 0). */ |
| 2244 | + copy_eltfp25519_1w(Px, X1); |
| 2245 | + setzero_eltfp25519_1w(Pz); |
| 2246 | + setzero_eltfp25519_1w(Qx); |
| 2247 | + setzero_eltfp25519_1w(Qz); |
| 2248 | + |
| 2249 | + Pz[0] = 1; |
| 2250 | + Qx[0] = 1; |
| 2251 | + |
| 2252 | + /* main-loop: scalar bits 254 down to 0; j starts at 62 in the top word, so bit 255 is never read */ |
| 2253 | + prev = 0; |
| 2254 | + j = 62; |
| 2255 | + for (i = 3; i >= 0; --i) { |
| 2256 | + while (j >= 0) { |
| 2257 | + u64 bit = (key[i] >> j) & 0x1; |
| 2258 | + u64 swap = bit ^ prev; /* set only when this bit differs from the previous one */ |
| 2259 | + prev = bit; |
| 2260 | + |
| 2261 | + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ |
| 2262 | + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ |
| 2263 | + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ |
| 2264 | + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ |
| 2265 | + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ |
| 2266 | + /* NOTE(review): cselect() is defined outside this chunk; presumably A = swap ? C : A and B = swap ? D : B without branching - confirm. */ |
| 2267 | + cselect(swap, A, C); |
| 2268 | + cselect(swap, B, D); |
| 2269 | + |
| 2270 | + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ |
| 2271 | + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ |
| 2272 | + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ |
| 2273 | + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */ |
| 2274 | + |
| 2275 | + copy_eltfp25519_1w(X2, B); /* X2 = BB (B already holds B^2) */ |
| 2276 | + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ |
| 2277 | + |
| 2278 | + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ |
| 2279 | + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+BB */ |
| 2280 | + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [BB|E]*[AA|a24*E+BB] */ |
| 2281 | + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ |
| 2282 | + --j; |
| 2283 | + } |
| 2284 | + j = 63; /* the remaining words are scanned in full */ |
| 2285 | + } |
| 2286 | + /* Convert to affine coordinates */ |
| 2287 | + inv_eltfp25519_1w_bmi2(A, Qz); |
| 2288 | + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); |
| 2289 | + fred_eltfp25519_1w((u64 *)shared); |
| 2290 | + |
| 2291 | + memzero_explicit(&m, sizeof(m)); /* wipe key copies and every intermediate value */ |
| 2292 | +} |
| 2293 | + |
| 2294 | +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], |
| 2295 | + const u8 private_key[CURVE25519_KEY_SIZE]) |
| 2296 | +{ /* Fixed-base scalar multiplication: session_key is derived from the clamped private_key with the table_ladder_8k table and the _bmi2 field routines. */ |
| 2297 | + struct { |
| 2298 | + u64 buffer[4 * NUM_WORDS_ELTFP25519]; |
| 2299 | + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; |
| 2300 | + u64 workspace[4 * NUM_WORDS_ELTFP25519]; |
| 2301 | + u8 private[CURVE25519_KEY_SIZE]; |
| 2302 | + } __aligned(32) m; |
| 2303 | + /* All secret-dependent state lives in m so the final memzero_explicit() wipes it in one go. */ |
| 2304 | + const int ite[4] = { 64, 64, 64, 63 }; /* per-word upper bound for j: bit 255 is never scanned */ |
| 2305 | + const int q = 3; /* the q lowest scalar bits are skipped by the main loop; q doublings follow it */ |
| 2306 | + u64 swap = 1; |
| 2307 | + |
| 2308 | + int i = 0, j = 0, k = 0; |
| 2309 | + u64 *const key = (u64 *)m.private; |
| 2310 | + u64 *const Ur1 = m.coordinates + 0; |
| 2311 | + u64 *const Zr1 = m.coordinates + 4; |
| 2312 | + u64 *const Ur2 = m.coordinates + 8; |
| 2313 | + u64 *const Zr2 = m.coordinates + 12; |
| 2314 | + /* Two-element views for the _2w routines; in memory order UZr1 = {Ur1, Zr1} and ZUr2 = {Ur2, Zr2}. */ |
| 2315 | + u64 *const UZr1 = m.coordinates + 0; |
| 2316 | + u64 *const ZUr2 = m.coordinates + 8; |
| 2317 | + |
| 2318 | + u64 *const A = m.workspace + 0; |
| 2319 | + u64 *const B = m.workspace + 4; |
| 2320 | + u64 *const C = m.workspace + 8; |
| 2321 | + u64 *const D = m.workspace + 12; |
| 2322 | + |
| 2323 | + u64 *const AB = m.workspace + 0; |
| 2324 | + u64 *const CD = m.workspace + 8; |
| 2325 | + |
| 2326 | + const u64 *const P = table_ladder_8k; /* read four words at a time, one entry per scanned bit */ |
| 2327 | + |
| 2328 | + memcpy(m.private, private_key, sizeof(m.private)); |
| 2329 | + |
| 2330 | + curve25519_clamp_secret(m.private); /* operates on the local copy, not on the caller's key */ |
| 2331 | + |
| 2332 | + setzero_eltfp25519_1w(Ur1); |
| 2333 | + setzero_eltfp25519_1w(Zr1); |
| 2334 | + setzero_eltfp25519_1w(Zr2); |
| 2335 | + Ur1[0] = 1; |
| 2336 | + Zr1[0] = 1; |
| 2337 | + Zr2[0] = 1; |
| 2338 | + |
| 2339 | + /* G-S */ |
| 2340 | + Ur2[3] = 0x1eaecdeee27cab34UL; |
| 2341 | + Ur2[2] = 0xadc7a0b9235d48e2UL; |
| 2342 | + Ur2[1] = 0xbbf095ae14b2edf8UL; |
| 2343 | + Ur2[0] = 0x7e94e1fec82faabdUL; |
| 2344 | + |
| 2345 | + /* main-loop: scalar bits q..254, lowest first; k = bit index - q selects the table entry */ |
| 2346 | + j = q; |
| 2347 | + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { |
| 2348 | + while (j < ite[i]) { |
| 2349 | + u64 bit = (key[i] >> j) & 0x1; |
| 2350 | + k = (64 * i + j - q); |
| 2351 | + swap = swap ^ bit; |
| 2352 | + cswap(swap, Ur1, Ur2); |
| 2353 | + cswap(swap, Zr1, Zr2); |
| 2354 | + swap = bit; |
| 2355 | + /* Addition */ |
| 2356 | + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ |
| 2357 | + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ |
| 2358 | + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M*B, M = table entry k */ |
| 2359 | + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ |
| 2360 | + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ |
| 2361 | + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ |
| 2362 | + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Ur2*A | Zr1 = Zr2*B (ZUr2 is Ur2 followed by Zr2) */ |
| 2363 | + ++j; |
| 2364 | + } |
| 2365 | + j = 0; /* later words start at bit 0 */ |
| 2366 | + } |
| 2367 | + |
| 2368 | + /* Doubling: q rounds, one for each low bit the main loop skipped */ |
| 2369 | + for (i = 0; i < q; ++i) { |
| 2370 | + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ |
| 2371 | + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ |
| 2372 | + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ |
| 2373 | + copy_eltfp25519_1w(C, B); /* C = B */ |
| 2374 | + sub_eltfp25519_1w(B, A, B); /* B = A-B */ |
| 2375 | + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ |
| 2376 | + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ |
| 2377 | + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*C | Zr1 = B*D */ |
| 2378 | + } |
| 2379 | + |
| 2380 | + /* Convert to affine coordinates */ |
| 2381 | + inv_eltfp25519_1w_bmi2(A, Zr1); |
| 2382 | + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); |
| 2383 | + fred_eltfp25519_1w((u64 *)session_key); |
| 2384 | + |
| 2385 | + memzero_explicit(&m, sizeof(m)); /* wipe the key copy and every intermediate value */ |
| 2386 | +} |
| 2387 | + |
| 2388 | +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], |
| 2389 | + const u8 secret[CURVE25519_KEY_SIZE], |
| 2390 | + const u8 basepoint[CURVE25519_KEY_SIZE]) |
| 2391 | +{ /* Exported arch entry point: dispatch the scalar multiplication to a backend chosen by the static keys. */ |
| 2392 | + if (static_branch_likely(&curve25519_use_adx)) /* the ADX variant is tried first */ |
| 2393 | + curve25519_adx(mypublic, secret, basepoint); |
| 2394 | + else if (static_branch_likely(&curve25519_use_bmi2)) /* then the BMI2 variant */ |
| 2395 | + curve25519_bmi2(mypublic, secret, basepoint); |
| 2396 | + else /* neither static key is enabled: fall back to the generic code */ |
| 2397 | + curve25519_generic(mypublic, secret, basepoint); |
| 2398 | +} |
| 2399 | +EXPORT_SYMBOL(curve25519_arch); |
| 2400 | + |
| 2401 | +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], |
| 2402 | + const u8 secret[CURVE25519_KEY_SIZE]) |
| 2403 | +{ /* Exported arch entry point for the fixed-base case: writes the result for secret into pub. */ |
| 2404 | + if (static_branch_likely(&curve25519_use_adx)) /* the ADX variant is tried first */ |
| 2405 | + curve25519_adx_base(pub, secret); |
| 2406 | + else if (static_branch_likely(&curve25519_use_bmi2)) /* then the BMI2 variant */ |
| 2407 | + curve25519_bmi2_base(pub, secret); |
| 2408 | + else /* no dedicated generic base routine is called here: pass curve25519_base_point explicitly */ |
| 2409 | + curve25519_generic(pub, secret, curve25519_base_point); |
| 2410 | +} |
| 2411 | +EXPORT_SYMBOL(curve25519_base_arch); |
| 2412 | + |
| 2413 | +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, |
| 2414 | + unsigned int len) |
| 2415 | +{ /* KPP .set_secret hook: store a secret in the tfm context; returns 0 on success or -EINVAL for an unacceptable key. */ |
| 2416 | + u8 *secret = kpp_tfm_ctx(tfm); |
| 2417 | + |
| 2418 | + if (!len) /* no key material supplied: have one generated into the context */ |
| 2419 | + curve25519_generate_secret(secret); |
| 2420 | + else if (len == CURVE25519_KEY_SIZE && |
| 2421 | + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) |
| 2422 | + memcpy(secret, buf, CURVE25519_KEY_SIZE); /* full-size key that differs from curve25519_null_point */ |
| 2423 | + else |
| 2424 | + return -EINVAL; /* wrong length, or the key equals curve25519_null_point */ |
| 2425 | + return 0; |
| 2426 | +} |
| 2427 | + |
| 2428 | +static int curve25519_generate_public_key(struct kpp_request *req) |
| 2429 | +{ /* KPP .generate_public_key hook: compute the public key for the stored secret and copy it to req->dst; 0 or -EINVAL. */ |
| 2430 | + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); |
| 2431 | + const u8 *secret = kpp_tfm_ctx(tfm); |
| 2432 | + u8 buf[CURVE25519_KEY_SIZE]; |
| 2433 | + int copied, nbytes; |
| 2434 | + |
| 2435 | + if (req->src) /* this operation takes no input; a source scatterlist is rejected */ |
| 2436 | + return -EINVAL; |
| 2437 | + |
| 2438 | + curve25519_base_arch(buf, secret); /* buf receives the public key */ |
| 2439 | + |
| 2440 | + /* might want less than we've got */ |
| 2441 | + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); |
| 2442 | + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, |
| 2443 | + nbytes), |
| 2444 | + buf, nbytes); |
| 2445 | + if (copied != nbytes) /* fewer bytes than requested reached the destination */ |
| 2446 | + return -EINVAL; |
| 2447 | + return 0; |
| 2448 | +} |
| 2449 | + |
| 2450 | +static int curve25519_compute_shared_secret(struct kpp_request *req) |
| 2451 | +{ /* KPP .compute_shared_secret hook: combine the stored secret with the public key read from req->src and copy the result to req->dst; 0 or -EINVAL. */ |
| 2452 | + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); |
| 2453 | + const u8 *secret = kpp_tfm_ctx(tfm); |
| 2454 | + u8 public_key[CURVE25519_KEY_SIZE]; |
| 2455 | + u8 buf[CURVE25519_KEY_SIZE]; |
| 2456 | + int copied, nbytes; |
| 2457 | + |
| 2458 | + if (!req->src) /* the public key input is mandatory */ |
| 2459 | + return -EINVAL; |
| 2460 | + |
| 2461 | + copied = sg_copy_to_buffer(req->src, |
| 2462 | + sg_nents_for_len(req->src, |
| 2463 | + CURVE25519_KEY_SIZE), |
| 2464 | + public_key, CURVE25519_KEY_SIZE); |
| 2465 | + if (copied != CURVE25519_KEY_SIZE) /* the source must supply a full-size public key */ |
| 2466 | + return -EINVAL; |
| 2467 | + |
| 2468 | + curve25519_arch(buf, secret, public_key); /* buf receives the shared secret */ |
| 2469 | + |
| 2470 | + /* might want less than we've got */ |
| 2471 | + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); |
| 2472 | + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, |
| 2473 | + nbytes), |
| 2474 | + buf, nbytes); |
| 2475 | + if (copied != nbytes) /* fewer bytes than requested reached the destination */ |
| 2476 | + return -EINVAL; |
| 2477 | + return 0; |
| 2478 | +} |
| 2479 | + |
| 2480 | +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) |
| 2481 | +{ /* KPP .max_size hook; tfm is not consulted. */ |
| 2482 | + return CURVE25519_KEY_SIZE; /* both KPP operations here copy out at most this many bytes */ |
| 2483 | +} |
| 2484 | + |
| 2485 | +static struct kpp_alg curve25519_alg = { /* KPP algorithm descriptor registered by curve25519_mod_init() */ |
| 2486 | + .base.cra_name = "curve25519", |
| 2487 | + .base.cra_driver_name = "curve25519-x86", |
| 2488 | + .base.cra_priority = 200, |
| 2489 | + .base.cra_module = THIS_MODULE, |
| 2490 | + .base.cra_ctxsize = CURVE25519_KEY_SIZE, /* the tfm context holds only the raw secret bytes */ |
| 2491 | + |
| 2492 | + .set_secret = curve25519_set_secret, |
| 2493 | + .generate_public_key = curve25519_generate_public_key, |
| 2494 | + .compute_shared_secret = curve25519_compute_shared_secret, |
| 2495 | + .max_size = curve25519_max_size, |
| 2496 | +}; |
| 2497 | + |
| 2498 | +static int __init curve25519_mod_init(void) |
| 2499 | +{ /* Enable the backends this CPU supports and register the KPP algorithm; returns 0 without registering when BMI2 is absent, else the crypto_register_kpp() result. */ |
| 2500 | + if (!boot_cpu_has(X86_FEATURE_BMI2)) /* BMI2 is the baseline: without it the generic code stays in use */ |
| 2501 | + return 0; |
| 2502 | + static_branch_enable(&curve25519_use_bmi2); |
| 2503 | + /* ADX is an add-on to BMI2, not an alternative: the dispatchers test the ADX key first, so enable it whenever both exist. NOTE(review): assumes the *_adx helpers also need MULX - confirm. */ |
| 2504 | + if (boot_cpu_has(X86_FEATURE_ADX)) |
| 2505 | + static_branch_enable(&curve25519_use_adx); |
| 2506 | + return crypto_register_kpp(&curve25519_alg); |
| 2507 | +} |
| 2508 | + |
| 2509 | +static void __exit curve25519_mod_exit(void) |
| 2510 | +{ /* Module teardown: undo the registration done by curve25519_mod_init(). */ |
| 2511 | + /* The algorithm is registered only when BMI2 is present, so test exactly that condition. */ |
| 2512 | + if (boot_cpu_has(X86_FEATURE_BMI2)) |
| 2513 | + crypto_unregister_kpp(&curve25519_alg); |
| 2514 | +} |
| 2515 | + |
| 2516 | +module_init(curve25519_mod_init); |
| 2517 | +module_exit(curve25519_mod_exit); |
| 2518 | + |
| 2519 | +MODULE_ALIAS_CRYPTO("curve25519"); /* same string as .base.cra_name */ |
| 2520 | +MODULE_ALIAS_CRYPTO("curve25519-x86"); /* same string as .base.cra_driver_name */ |
| 2521 | +MODULE_LICENSE("GPL v2"); |
| 2522 | --- a/crypto/Kconfig |
| 2523 | +++ b/crypto/Kconfig |
| 2524 | @@ -269,6 +269,12 @@ config CRYPTO_CURVE25519 |
| 2525 | select CRYPTO_KPP |
| 2526 | select CRYPTO_LIB_CURVE25519_GENERIC |
| 2527 | |
| 2528 | +config CRYPTO_CURVE25519_X86 |
| 2529 | + tristate "x86_64 accelerated Curve25519 scalar multiplication library" |
| 2530 | + depends on X86 && 64BIT |
| 2531 | + select CRYPTO_LIB_CURVE25519_GENERIC |
| 2532 | + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 |
| 2533 | + |
| 2534 | comment "Authenticated Encryption with Associated Data" |
| 2535 | |
| 2536 | config CRYPTO_CCM |