Blame - target/linux/generic/backport-5.4/080-wireguard-0025-crypto-curve25519-generic-C-library-implementations.patch - T108

blob: 13003b2cdacaf1bfeaead512e89162504b58d4ae [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
				2	From: "Jason A. Donenfeld" <Jason@zx2c4.com>
				3	Date: Fri, 8 Nov 2019 13:22:32 +0100
				4	Subject: [PATCH] crypto: curve25519 - generic C library implementations
				5
				6	commit 0ed42a6f431e930b2e8fae21955406e09fe75d70 upstream.
				7
				8	This contains two formally verified C implementations of the Curve25519
				9	scalar multiplication function, one for 32-bit systems, and one for
				10	64-bit systems whose compiler supports efficient 128-bit integer types.
				11	Not only are these implementations formally verified, but they are also
				12	the fastest available C implementations. They have been modified to be
				13	friendly to kernel space and to be generally less horrendous looking,
				14	but still an effort has been made to retain their formally verified
				15	characteristic, and so the C might look slightly unidiomatic.
				16
				17	The 64-bit version comes from HACL*: https://github.com/project-everest/hacl-star
				18	The 32-bit version comes from Fiat: https://github.com/mit-plv/fiat-crypto
				19
				20	Information: https://cr.yp.to/ecdh.html
				21
				22	Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
				23	[ardb: - move from lib/zinc to lib/crypto
				24	- replace .c #includes with Kconfig based object selection
				25	- drop simd handling and simplify support for per-arch versions ]
				26	Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
				27	Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
				28	Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
				29	---
				30	include/crypto/curve25519.h | 71 +++
				31	lib/crypto/Kconfig | 25 +
				32	lib/crypto/Makefile | 5 +
				33	lib/crypto/curve25519-fiat32.c | 864 +++++++++++++++++++++++++++++++++
				34	lib/crypto/curve25519-hacl64.c | 788 ++++++++++++++++++++++++++++++
				35	lib/crypto/curve25519.c | 25 +
				36	6 files changed, 1778 insertions(+)
				37	create mode 100644 include/crypto/curve25519.h
				38	create mode 100644 lib/crypto/curve25519-fiat32.c
				39	create mode 100644 lib/crypto/curve25519-hacl64.c
				40	create mode 100644 lib/crypto/curve25519.c
				41
				42	--- /dev/null
				43	+++ b/include/crypto/curve25519.h
				44	@@ -0,0 +1,71 @@
				45	+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
				46	+/*
				47	+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
				48	+ */
				49	+
				50	+#ifndef CURVE25519_H
				51	+#define CURVE25519_H
				52	+
				53	+#include <crypto/algapi.h> // For crypto_memneq.
				54	+#include <linux/types.h>
				55	+#include <linux/random.h>
				56	+
				57	+enum curve25519_lengths {
				58	+ CURVE25519_KEY_SIZE = 32
				59	+};
				60	+
				61	+extern const u8 curve25519_null_point[];
				62	+extern const u8 curve25519_base_point[];
				63	+
				64	+void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
				65	+ const u8 scalar[CURVE25519_KEY_SIZE],
				66	+ const u8 point[CURVE25519_KEY_SIZE]);
				67	+
				68	+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
				69	+ const u8 scalar[CURVE25519_KEY_SIZE],
				70	+ const u8 point[CURVE25519_KEY_SIZE]);
				71	+
				72	+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
				73	+ const u8 secret[CURVE25519_KEY_SIZE]);
				74	+
				75	+static inline
				76	+bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
				77	+ const u8 secret[CURVE25519_KEY_SIZE],
				78	+ const u8 basepoint[CURVE25519_KEY_SIZE])
				79	+{
				80	+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
				81	+ curve25519_arch(mypublic, secret, basepoint);
				82	+ else
				83	+ curve25519_generic(mypublic, secret, basepoint);
				84	+ return crypto_memneq(mypublic, curve25519_null_point,
				85	+ CURVE25519_KEY_SIZE);
				86	+}
				87	+
				88	+static inline bool
				89	+__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
				90	+ const u8 secret[CURVE25519_KEY_SIZE])
				91	+{
				92	+ if (unlikely(!crypto_memneq(secret, curve25519_null_point,
				93	+ CURVE25519_KEY_SIZE)))
				94	+ return false;
				95	+
				96	+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
				97	+ curve25519_base_arch(pub, secret);
				98	+ else
				99	+ curve25519_generic(pub, secret, curve25519_base_point);
				100	+ return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
				101	+}
				102	+
				103	+static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
				104	+{
				105	+ secret[0] &= 248; /* clear the three low bits of the first byte */
				106	+ secret[31] = (secret[31] & 127) | 64; /* clear bit 7, set bit 6 of the last byte */
				107	+}
				108	+
				109	+static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
				110	+{
				111	+ get_random_bytes_wait(secret, CURVE25519_KEY_SIZE);
				112	+ curve25519_clamp_secret(secret);
				113	+}
				114	+
				115	+#endif /* CURVE25519_H */
				116	--- a/lib/crypto/Kconfig
				117	+++ b/lib/crypto/Kconfig
				118	@@ -34,6 +34,31 @@ config CRYPTO_LIB_CHACHA
				119	by either the generic implementation or an arch-specific one, if one
				120	is available and enabled.
				121
				122	+config CRYPTO_ARCH_HAVE_LIB_CURVE25519
				123	+ tristate
				124	+ help
				125	+ Declares whether the architecture provides an arch-specific
				126	+ accelerated implementation of the Curve25519 library interface,
				127	+ either builtin or as a module.
				128	+
				129	+config CRYPTO_LIB_CURVE25519_GENERIC
				130	+ tristate
				131	+ help
				132	+ This symbol can be depended upon by arch implementations of the
				133	+ Curve25519 library interface that require the generic code as a
				134	+ fallback, e.g., for SIMD implementations. If no arch specific
				135	+ implementation is enabled, this implementation serves the users
				136	+ of CRYPTO_LIB_CURVE25519.
				137	+
				138	+config CRYPTO_LIB_CURVE25519
				139	+ tristate "Curve25519 scalar multiplication library"
				140	+ depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
				141	+ select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
				142	+ help
				143	+ Enable the Curve25519 library interface. This interface may be
				144	+ fulfilled by either the generic implementation or an arch-specific
				145	+ one, if one is available and enabled.
				146	+
				147	config CRYPTO_LIB_DES
				148	tristate
				149
				150	--- a/lib/crypto/Makefile
				151	+++ b/lib/crypto/Makefile
				152	@@ -10,6 +10,11 @@ libaes-y := aes.o
				153	obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
				154	libarc4-y := arc4.o
				155
				156	+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519.o
				157	+libcurve25519-y := curve25519-fiat32.o
				158	+libcurve25519-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
				159	+libcurve25519-y += curve25519.o
				160	+
				161	obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
				162	libdes-y := des.o
				163
				164	--- /dev/null
				165	+++ b/lib/crypto/curve25519-fiat32.c
				166	@@ -0,0 +1,864 @@
				167	+// SPDX-License-Identifier: GPL-2.0 OR MIT
				168	+/*
				169	+ * Copyright (C) 2015-2016 The fiat-crypto Authors.
				170	+ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
				171	+ *
				172	+ * This is a machine-generated formally verified implementation of Curve25519
				173	+ * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
				174	+ * machine generated, it has been tweaked to be suitable for use in the kernel.
				175	+ * It is optimized for 32-bit machines and machines that cannot work efficiently
				176	+ * with 128-bit integer types.
				177	+ */
				178	+
				179	+#include <asm/unaligned.h>
				180	+#include <crypto/curve25519.h>
				181	+#include <linux/string.h>
				182	+
				183	+/* fe means field element. Here the field is \Z/(2^255-19). An element t,
				184	+ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
				185	+ * t[3]+2^102 t[4]+...+2^230 t[9].
				186	+ * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
				187	+ * Multiplication and carrying produce fe from fe_loose.
				188	+ */
				189	+typedef struct fe { u32 v[10]; } fe;
				190	+
				191	+/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
				192	+ * Addition and subtraction produce fe_loose from (fe, fe).
				193	+ */
				194	+typedef struct fe_loose { u32 v[10]; } fe_loose;
				195	+
				196	+static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s)
				197	+{
				198	+ /* Splits the 32 bytes at s into ten limbs of alternating 26/25 bits. Ignores top bit of s. */
				199	+ u32 a0 = get_unaligned_le32(s);
				200	+ u32 a1 = get_unaligned_le32(s+4);
				201	+ u32 a2 = get_unaligned_le32(s+8);
				202	+ u32 a3 = get_unaligned_le32(s+12);
				203	+ u32 a4 = get_unaligned_le32(s+16);
				204	+ u32 a5 = get_unaligned_le32(s+20);
				205	+ u32 a6 = get_unaligned_le32(s+24);
				206	+ u32 a7 = get_unaligned_le32(s+28);
				207	+ h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */
				208	+ h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */
				209	+ h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
				210	+ h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */
				211	+ h[4] = (a3>> 6); /* (32- 6) = 26 */
				212	+ h[5] = a4&((1<<25)-1); /* 25 */
				213	+ h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */
				214	+ h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
				215	+ h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */
				216	+ h[9] = (a7>> 6)&((1<<25)-1); /* 25 */
				217	+}
				218	+
				219	+static __always_inline void fe_frombytes(fe *h, const u8 *s)
				220	+{
				221	+ fe_frombytes_impl(h->v, s);
				222	+}
				223	+
				224	+static __always_inline u8 /*bool*/
				225	+addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
				226	+{
				227	+ /* This function extracts 25 bits of result and 1 bit of carry
				228	+ * (26 total), so a 32-bit intermediate is sufficient.
				229	+ */
				230	+ u32 x = a + b + c;
				231	+ *low = x & ((1 << 25) - 1);
				232	+ return (x >> 25) & 1;
				233	+}
				234	+
				235	+static __always_inline u8 /*bool*/
				236	+addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
				237	+{
				238	+ /* This function extracts 26 bits of result and 1 bit of carry
				239	+ * (27 total), so a 32-bit intermediate is sufficient.
				240	+ */
				241	+ u32 x = a + b + c;
				242	+ *low = x & ((1 << 26) - 1);
				243	+ return (x >> 26) & 1;
				244	+}
				245	+
				246	+static __always_inline u8 /*bool*/
				247	+subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
				248	+{
				249	+ /* This function extracts 25 bits of result and 1 bit of borrow
				250	+ * (26 total), so a 32-bit intermediate is sufficient.
				251	+ */
				252	+ u32 x = a - b - c;
				253	+ *low = x & ((1 << 25) - 1);
				254	+ return x >> 31;
				255	+}
				256	+
				257	+static __always_inline u8 /*bool*/
				258	+subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
				259	+{
				260	+ /* This function extracts 26 bits of result and 1 bit of borrow
				261	+ * (27 total), so a 32-bit intermediate is sufficient.
				262	+ */
				263	+ u32 x = a - b - c;
				264	+ *low = x & ((1 << 26) - 1);
				265	+ return x >> 31;
				266	+}
				267	+
				268	+static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz)
				269	+{
				270	+ t = -!!t; /* all set if nonzero, 0 if 0 */
				271	+ return (t&nz) | ((~t)&z); /* branch-free select: nz when t was nonzero, else z */
				272	+}
				273	+
				274	+static __always_inline void fe_freeze(u32 out[10], const u32 in1[10])
				275	+{
				276	+ { const u32 x17 = in1[9];
				277	+ { const u32 x18 = in1[8];
				278	+ { const u32 x16 = in1[7];
				279	+ { const u32 x14 = in1[6];
				280	+ { const u32 x12 = in1[5];
				281	+ { const u32 x10 = in1[4];
				282	+ { const u32 x8 = in1[3];
				283	+ { const u32 x6 = in1[2];
				284	+ { const u32 x4 = in1[1];
				285	+ { const u32 x2 = in1[0];
				286	+ { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
				287	+ { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
				288	+ { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
				289	+ { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
				290	+ { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
				291	+ { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
				292	+ { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
				293	+ { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
				294	+ { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
				295	+ { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
				296	+ { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff);
				297	+ { u32 x50 = (x49 & 0x3ffffed);
				298	+ { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
				299	+ { u32 x54 = (x49 & 0x1ffffff);
				300	+ { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
				301	+ { u32 x58 = (x49 & 0x3ffffff);
				302	+ { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
				303	+ { u32 x62 = (x49 & 0x1ffffff);
				304	+ { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
				305	+ { u32 x66 = (x49 & 0x3ffffff);
				306	+ { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
				307	+ { u32 x70 = (x49 & 0x1ffffff);
				308	+ { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
				309	+ { u32 x74 = (x49 & 0x3ffffff);
				310	+ { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
				311	+ { u32 x78 = (x49 & 0x1ffffff);
				312	+ { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
				313	+ { u32 x82 = (x49 & 0x3ffffff);
				314	+ { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
				315	+ { u32 x86 = (x49 & 0x1ffffff);
				316	+ { u32 x88; addcarryx_u25(x85, x47, x86, &x88);
				317	+ out[0] = x52;
				318	+ out[1] = x56;
				319	+ out[2] = x60;
				320	+ out[3] = x64;
				321	+ out[4] = x68;
				322	+ out[5] = x72;
				323	+ out[6] = x76;
				324	+ out[7] = x80;
				325	+ out[8] = x84;
				326	+ out[9] = x88;
				327	+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				328	+}
				329	+
				330	+static __always_inline void fe_tobytes(u8 s[32], const fe *f)
				331	+{
				332	+ u32 h[10];
				333	+ fe_freeze(h, f->v);
				334	+ s[0] = h[0] >> 0;
				335	+ s[1] = h[0] >> 8;
				336	+ s[2] = h[0] >> 16;
				337	+ s[3] = (h[0] >> 24) | (h[1] << 2);
				338	+ s[4] = h[1] >> 6;
				339	+ s[5] = h[1] >> 14;
				340	+ s[6] = (h[1] >> 22) | (h[2] << 3);
				341	+ s[7] = h[2] >> 5;
				342	+ s[8] = h[2] >> 13;
				343	+ s[9] = (h[2] >> 21) | (h[3] << 5);
				344	+ s[10] = h[3] >> 3;
				345	+ s[11] = h[3] >> 11;
				346	+ s[12] = (h[3] >> 19) | (h[4] << 6);
				347	+ s[13] = h[4] >> 2;
				348	+ s[14] = h[4] >> 10;
				349	+ s[15] = h[4] >> 18;
				350	+ s[16] = h[5] >> 0;
				351	+ s[17] = h[5] >> 8;
				352	+ s[18] = h[5] >> 16;
				353	+ s[19] = (h[5] >> 24) | (h[6] << 1);
				354	+ s[20] = h[6] >> 7;
				355	+ s[21] = h[6] >> 15;
				356	+ s[22] = (h[6] >> 23) | (h[7] << 3);
				357	+ s[23] = h[7] >> 5;
				358	+ s[24] = h[7] >> 13;
				359	+ s[25] = (h[7] >> 21) | (h[8] << 4);
				360	+ s[26] = h[8] >> 4;
				361	+ s[27] = h[8] >> 12;
				362	+ s[28] = (h[8] >> 20) | (h[9] << 6);
				363	+ s[29] = h[9] >> 2;
				364	+ s[30] = h[9] >> 10;
				365	+ s[31] = h[9] >> 18;
				366	+}
				367	+
				368	+/* h = f */
				369	+static __always_inline void fe_copy(fe *h, const fe *f)
				370	+{
				371	+ memmove(h, f, sizeof(u32) * 10);
				372	+}
				373	+
				374	+static __always_inline void fe_copy_lt(fe_loose *h, const fe *f)
				375	+{
				376	+ memmove(h, f, sizeof(u32) * 10);
				377	+}
				378	+
				379	+/* h = 0 */
				380	+static __always_inline void fe_0(fe *h)
				381	+{
				382	+ memset(h, 0, sizeof(u32) * 10);
				383	+}
				384	+
				385	+/* h = 1 */
				386	+static __always_inline void fe_1(fe *h)
				387	+{
				388	+ memset(h, 0, sizeof(u32) * 10);
				389	+ h->v[0] = 1;
				390	+}
				391	+
				392	+static void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
				393	+{
				394	+ { const u32 x20 = in1[9];
				395	+ { const u32 x21 = in1[8];
				396	+ { const u32 x19 = in1[7];
				397	+ { const u32 x17 = in1[6];
				398	+ { const u32 x15 = in1[5];
				399	+ { const u32 x13 = in1[4];
				400	+ { const u32 x11 = in1[3];
				401	+ { const u32 x9 = in1[2];
				402	+ { const u32 x7 = in1[1];
				403	+ { const u32 x5 = in1[0];
				404	+ { const u32 x38 = in2[9];
				405	+ { const u32 x39 = in2[8];
				406	+ { const u32 x37 = in2[7];
				407	+ { const u32 x35 = in2[6];
				408	+ { const u32 x33 = in2[5];
				409	+ { const u32 x31 = in2[4];
				410	+ { const u32 x29 = in2[3];
				411	+ { const u32 x27 = in2[2];
				412	+ { const u32 x25 = in2[1];
				413	+ { const u32 x23 = in2[0];
				414	+ out[0] = (x5 + x23);
				415	+ out[1] = (x7 + x25);
				416	+ out[2] = (x9 + x27);
				417	+ out[3] = (x11 + x29);
				418	+ out[4] = (x13 + x31);
				419	+ out[5] = (x15 + x33);
				420	+ out[6] = (x17 + x35);
				421	+ out[7] = (x19 + x37);
				422	+ out[8] = (x21 + x39);
				423	+ out[9] = (x20 + x38);
				424	+ }}}}}}}}}}}}}}}}}}}}
				425	+}
				426	+
				427	+/* h = f + g
				428	+ * Can overlap h with f or g.
				429	+ */
				430	+static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g)
				431	+{
				432	+ fe_add_impl(h->v, f->v, g->v);
				433	+}
				434	+
				435	+static void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
				436	+{
				437	+ { const u32 x20 = in1[9];
				438	+ { const u32 x21 = in1[8];
				439	+ { const u32 x19 = in1[7];
				440	+ { const u32 x17 = in1[6];
				441	+ { const u32 x15 = in1[5];
				442	+ { const u32 x13 = in1[4];
				443	+ { const u32 x11 = in1[3];
				444	+ { const u32 x9 = in1[2];
				445	+ { const u32 x7 = in1[1];
				446	+ { const u32 x5 = in1[0];
				447	+ { const u32 x38 = in2[9];
				448	+ { const u32 x39 = in2[8];
				449	+ { const u32 x37 = in2[7];
				450	+ { const u32 x35 = in2[6];
				451	+ { const u32 x33 = in2[5];
				452	+ { const u32 x31 = in2[4];
				453	+ { const u32 x29 = in2[3];
				454	+ { const u32 x27 = in2[2];
				455	+ { const u32 x25 = in2[1];
				456	+ { const u32 x23 = in2[0];
				457	+ out[0] = ((0x7ffffda + x5) - x23);
				458	+ out[1] = ((0x3fffffe + x7) - x25);
				459	+ out[2] = ((0x7fffffe + x9) - x27);
				460	+ out[3] = ((0x3fffffe + x11) - x29);
				461	+ out[4] = ((0x7fffffe + x13) - x31);
				462	+ out[5] = ((0x3fffffe + x15) - x33);
				463	+ out[6] = ((0x7fffffe + x17) - x35);
				464	+ out[7] = ((0x3fffffe + x19) - x37);
				465	+ out[8] = ((0x7fffffe + x21) - x39);
				466	+ out[9] = ((0x3fffffe + x20) - x38);
				467	+ }}}}}}}}}}}}}}}}}}}}
				468	+}
				469	+
				470	+/* h = f - g
				471	+ * Can overlap h with f or g.
				472	+ */
				473	+static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g)
				474	+{
				475	+ fe_sub_impl(h->v, f->v, g->v);
				476	+}
				477	+
				478	+static void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
				479	+{
				480	+ { const u32 x20 = in1[9];
				481	+ { const u32 x21 = in1[8];
				482	+ { const u32 x19 = in1[7];
				483	+ { const u32 x17 = in1[6];
				484	+ { const u32 x15 = in1[5];
				485	+ { const u32 x13 = in1[4];
				486	+ { const u32 x11 = in1[3];
				487	+ { const u32 x9 = in1[2];
				488	+ { const u32 x7 = in1[1];
				489	+ { const u32 x5 = in1[0];
				490	+ { const u32 x38 = in2[9];
				491	+ { const u32 x39 = in2[8];
				492	+ { const u32 x37 = in2[7];
				493	+ { const u32 x35 = in2[6];
				494	+ { const u32 x33 = in2[5];
				495	+ { const u32 x31 = in2[4];
				496	+ { const u32 x29 = in2[3];
				497	+ { const u32 x27 = in2[2];
				498	+ { const u32 x25 = in2[1];
				499	+ { const u32 x23 = in2[0];
				500	+ { u64 x40 = ((u64)x23 * x5);
				501	+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
				502	+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
				503	+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
				504	+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
				505	+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
				506	+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
				507	+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
				508	+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
				509	+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
				510	+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
				511	+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
				512	+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
				513	+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
				514	+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
				515	+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
				516	+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
				517	+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
				518	+ { u64 x58 = ((u64)(0x2 * x38) * x20);
				519	+ { u64 x59 = (x48 + (x58 << 0x4));
				520	+ { u64 x60 = (x59 + (x58 << 0x1));
				521	+ { u64 x61 = (x60 + x58);
				522	+ { u64 x62 = (x47 + (x57 << 0x4));
				523	+ { u64 x63 = (x62 + (x57 << 0x1));
				524	+ { u64 x64 = (x63 + x57);
				525	+ { u64 x65 = (x46 + (x56 << 0x4));
				526	+ { u64 x66 = (x65 + (x56 << 0x1));
				527	+ { u64 x67 = (x66 + x56);
				528	+ { u64 x68 = (x45 + (x55 << 0x4));
				529	+ { u64 x69 = (x68 + (x55 << 0x1));
				530	+ { u64 x70 = (x69 + x55);
				531	+ { u64 x71 = (x44 + (x54 << 0x4));
				532	+ { u64 x72 = (x71 + (x54 << 0x1));
				533	+ { u64 x73 = (x72 + x54);
				534	+ { u64 x74 = (x43 + (x53 << 0x4));
				535	+ { u64 x75 = (x74 + (x53 << 0x1));
				536	+ { u64 x76 = (x75 + x53);
				537	+ { u64 x77 = (x42 + (x52 << 0x4));
				538	+ { u64 x78 = (x77 + (x52 << 0x1));
				539	+ { u64 x79 = (x78 + x52);
				540	+ { u64 x80 = (x41 + (x51 << 0x4));
				541	+ { u64 x81 = (x80 + (x51 << 0x1));
				542	+ { u64 x82 = (x81 + x51);
				543	+ { u64 x83 = (x40 + (x50 << 0x4));
				544	+ { u64 x84 = (x83 + (x50 << 0x1));
				545	+ { u64 x85 = (x84 + x50);
				546	+ { u64 x86 = (x85 >> 0x1a);
				547	+ { u32 x87 = ((u32)x85 & 0x3ffffff);
				548	+ { u64 x88 = (x86 + x82);
				549	+ { u64 x89 = (x88 >> 0x19);
				550	+ { u32 x90 = ((u32)x88 & 0x1ffffff);
				551	+ { u64 x91 = (x89 + x79);
				552	+ { u64 x92 = (x91 >> 0x1a);
				553	+ { u32 x93 = ((u32)x91 & 0x3ffffff);
				554	+ { u64 x94 = (x92 + x76);
				555	+ { u64 x95 = (x94 >> 0x19);
				556	+ { u32 x96 = ((u32)x94 & 0x1ffffff);
				557	+ { u64 x97 = (x95 + x73);
				558	+ { u64 x98 = (x97 >> 0x1a);
				559	+ { u32 x99 = ((u32)x97 & 0x3ffffff);
				560	+ { u64 x100 = (x98 + x70);
				561	+ { u64 x101 = (x100 >> 0x19);
				562	+ { u32 x102 = ((u32)x100 & 0x1ffffff);
				563	+ { u64 x103 = (x101 + x67);
				564	+ { u64 x104 = (x103 >> 0x1a);
				565	+ { u32 x105 = ((u32)x103 & 0x3ffffff);
				566	+ { u64 x106 = (x104 + x64);
				567	+ { u64 x107 = (x106 >> 0x19);
				568	+ { u32 x108 = ((u32)x106 & 0x1ffffff);
				569	+ { u64 x109 = (x107 + x61);
				570	+ { u64 x110 = (x109 >> 0x1a);
				571	+ { u32 x111 = ((u32)x109 & 0x3ffffff);
				572	+ { u64 x112 = (x110 + x49);
				573	+ { u64 x113 = (x112 >> 0x19);
				574	+ { u32 x114 = ((u32)x112 & 0x1ffffff);
				575	+ { u64 x115 = (x87 + (0x13 * x113));
				576	+ { u32 x116 = (u32) (x115 >> 0x1a);
				577	+ { u32 x117 = ((u32)x115 & 0x3ffffff);
				578	+ { u32 x118 = (x116 + x90);
				579	+ { u32 x119 = (x118 >> 0x19);
				580	+ { u32 x120 = (x118 & 0x1ffffff);
				581	+ out[0] = x117;
				582	+ out[1] = x120;
				583	+ out[2] = (x119 + x93);
				584	+ out[3] = x96;
				585	+ out[4] = x99;
				586	+ out[5] = x102;
				587	+ out[6] = x105;
				588	+ out[7] = x108;
				589	+ out[8] = x111;
				590	+ out[9] = x114;
				591	+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				592	+}
				593	+
				594	+static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g)
				595	+{
				596	+ fe_mul_impl(h->v, f->v, g->v);
				597	+}
				598	+
				599	+static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
				600	+{
				601	+ fe_mul_impl(h->v, f->v, g->v);
				602	+}
				603	+
				604	+static __always_inline void
				605	+fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
				606	+{
				607	+ fe_mul_impl(h->v, f->v, g->v);
				608	+}
				609	+
				610	+static void fe_sqr_impl(u32 out[10], const u32 in1[10])
				611	+{
				612	+ { const u32 x17 = in1[9];
				613	+ { const u32 x18 = in1[8];
				614	+ { const u32 x16 = in1[7];
				615	+ { const u32 x14 = in1[6];
				616	+ { const u32 x12 = in1[5];
				617	+ { const u32 x10 = in1[4];
				618	+ { const u32 x8 = in1[3];
				619	+ { const u32 x6 = in1[2];
				620	+ { const u32 x4 = in1[1];
				621	+ { const u32 x2 = in1[0];
				622	+ { u64 x19 = ((u64)x2 * x2);
				623	+ { u64 x20 = ((u64)(0x2 * x2) * x4);
				624	+ { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6)));
				625	+ { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8)));
				626	+ { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10));
				627	+ { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12)));
				628	+ { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12)));
				629	+ { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16)));
				630	+ { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12))))));
				631	+ { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17)));
				632	+ { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17)))));
				633	+ { u64 x30 = (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17)));
				634	+ { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17))))));
				635	+ { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17)));
				636	+ { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17)));
				637	+ { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17)));
				638	+ { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17));
				639	+ { u64 x36 = ((u64)(0x2 * x18) * x17);
				640	+ { u64 x37 = ((u64)(0x2 * x17) * x17);
				641	+ { u64 x38 = (x27 + (x37 << 0x4));
				642	+ { u64 x39 = (x38 + (x37 << 0x1));
				643	+ { u64 x40 = (x39 + x37);
				644	+ { u64 x41 = (x26 + (x36 << 0x4));
				645	+ { u64 x42 = (x41 + (x36 << 0x1));
				646	+ { u64 x43 = (x42 + x36);
				647	+ { u64 x44 = (x25 + (x35 << 0x4));
				648	+ { u64 x45 = (x44 + (x35 << 0x1));
				649	+ { u64 x46 = (x45 + x35);
				650	+ { u64 x47 = (x24 + (x34 << 0x4));
				651	+ { u64 x48 = (x47 + (x34 << 0x1));
				652	+ { u64 x49 = (x48 + x34);
				653	+ { u64 x50 = (x23 + (x33 << 0x4));
				654	+ { u64 x51 = (x50 + (x33 << 0x1));
				655	+ { u64 x52 = (x51 + x33);
				656	+ { u64 x53 = (x22 + (x32 << 0x4));
				657	+ { u64 x54 = (x53 + (x32 << 0x1));
				658	+ { u64 x55 = (x54 + x32);
				659	+ { u64 x56 = (x21 + (x31 << 0x4));
				660	+ { u64 x57 = (x56 + (x31 << 0x1));
				661	+ { u64 x58 = (x57 + x31);
				662	+ { u64 x59 = (x20 + (x30 << 0x4));
				663	+ { u64 x60 = (x59 + (x30 << 0x1));
				664	+ { u64 x61 = (x60 + x30);
				665	+ { u64 x62 = (x19 + (x29 << 0x4));
				666	+ { u64 x63 = (x62 + (x29 << 0x1));
				667	+ { u64 x64 = (x63 + x29);
				668	+ { u64 x65 = (x64 >> 0x1a);
				669	+ { u32 x66 = ((u32)x64 & 0x3ffffff);
				670	+ { u64 x67 = (x65 + x61);
				671	+ { u64 x68 = (x67 >> 0x19);
				672	+ { u32 x69 = ((u32)x67 & 0x1ffffff);
				673	+ { u64 x70 = (x68 + x58);
				674	+ { u64 x71 = (x70 >> 0x1a);
				675	+ { u32 x72 = ((u32)x70 & 0x3ffffff);
				676	+ { u64 x73 = (x71 + x55);
				677	+ { u64 x74 = (x73 >> 0x19);
				678	+ { u32 x75 = ((u32)x73 & 0x1ffffff);
				679	+ { u64 x76 = (x74 + x52);
				680	+ { u64 x77 = (x76 >> 0x1a);
				681	+ { u32 x78 = ((u32)x76 & 0x3ffffff);
				682	+ { u64 x79 = (x77 + x49);
				683	+ { u64 x80 = (x79 >> 0x19);
				684	+ { u32 x81 = ((u32)x79 & 0x1ffffff);
				685	+ { u64 x82 = (x80 + x46);
				686	+ { u64 x83 = (x82 >> 0x1a);
				687	+ { u32 x84 = ((u32)x82 & 0x3ffffff);
				688	+ { u64 x85 = (x83 + x43);
				689	+ { u64 x86 = (x85 >> 0x19);
				690	+ { u32 x87 = ((u32)x85 & 0x1ffffff);
				691	+ { u64 x88 = (x86 + x40);
				692	+ { u64 x89 = (x88 >> 0x1a);
				693	+ { u32 x90 = ((u32)x88 & 0x3ffffff);
				694	+ { u64 x91 = (x89 + x28);
				695	+ { u64 x92 = (x91 >> 0x19);
				696	+ { u32 x93 = ((u32)x91 & 0x1ffffff);
				697	+ { u64 x94 = (x66 + (0x13 * x92));
				698	+ { u32 x95 = (u32) (x94 >> 0x1a);
				699	+ { u32 x96 = ((u32)x94 & 0x3ffffff);
				700	+ { u32 x97 = (x95 + x69);
				701	+ { u32 x98 = (x97 >> 0x19);
				702	+ { u32 x99 = (x97 & 0x1ffffff);
				703	+ out[0] = x96;
				704	+ out[1] = x99;
				705	+ out[2] = (x98 + x72);
				706	+ out[3] = x75;
				707	+ out[4] = x78;
				708	+ out[5] = x81;
				709	+ out[6] = x84;
				710	+ out[7] = x87;
				711	+ out[8] = x90;
				712	+ out[9] = x93;
				713	+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				714	+}
				715	+
				716	+static __always_inline void fe_sq_tl(fe *h, const fe_loose *f)
				717	+{
				718	+ fe_sqr_impl(h->v, f->v);
				719	+}
				720	+
				721	+static __always_inline void fe_sq_tt(fe *h, const fe *f)
				722	+{
				723	+ fe_sqr_impl(h->v, f->v);
				724	+}
				725	+
				726	+static __always_inline void fe_loose_invert(fe *out, const fe_loose *z)
				727	+{
				728	+ fe t0;
				729	+ fe t1;
				730	+ fe t2;
				731	+ fe t3;
				732	+ int i;
				733	+
				734	+ fe_sq_tl(&t0, z);
				735	+ fe_sq_tt(&t1, &t0);
				736	+ for (i = 1; i < 2; ++i)
				737	+ fe_sq_tt(&t1, &t1);
				738	+ fe_mul_tlt(&t1, z, &t1);
				739	+ fe_mul_ttt(&t0, &t0, &t1);
				740	+ fe_sq_tt(&t2, &t0);
				741	+ fe_mul_ttt(&t1, &t1, &t2);
				742	+ fe_sq_tt(&t2, &t1);
				743	+ for (i = 1; i < 5; ++i)
				744	+ fe_sq_tt(&t2, &t2);
				745	+ fe_mul_ttt(&t1, &t2, &t1);
				746	+ fe_sq_tt(&t2, &t1);
				747	+ for (i = 1; i < 10; ++i)
				748	+ fe_sq_tt(&t2, &t2);
				749	+ fe_mul_ttt(&t2, &t2, &t1);
				750	+ fe_sq_tt(&t3, &t2);
				751	+ for (i = 1; i < 20; ++i)
				752	+ fe_sq_tt(&t3, &t3);
				753	+ fe_mul_ttt(&t2, &t3, &t2);
				754	+ fe_sq_tt(&t2, &t2);
				755	+ for (i = 1; i < 10; ++i)
				756	+ fe_sq_tt(&t2, &t2);
				757	+ fe_mul_ttt(&t1, &t2, &t1);
				758	+ fe_sq_tt(&t2, &t1);
				759	+ for (i = 1; i < 50; ++i)
				760	+ fe_sq_tt(&t2, &t2);
				761	+ fe_mul_ttt(&t2, &t2, &t1);
				762	+ fe_sq_tt(&t3, &t2);
				763	+ for (i = 1; i < 100; ++i)
				764	+ fe_sq_tt(&t3, &t3);
				765	+ fe_mul_ttt(&t2, &t3, &t2);
				766	+ fe_sq_tt(&t2, &t2);
				767	+ for (i = 1; i < 50; ++i)
				768	+ fe_sq_tt(&t2, &t2);
				769	+ fe_mul_ttt(&t1, &t2, &t1);
				770	+ fe_sq_tt(&t1, &t1);
				771	+ for (i = 1; i < 5; ++i)
				772	+ fe_sq_tt(&t1, &t1);
				773	+ fe_mul_ttt(out, &t1, &t0);
				774	+}
				775	+
				776	+static __always_inline void fe_invert(fe *out, const fe *z)
				777	+{ /* Copies z into a fe_loose with fe_copy_lt, then runs fe_loose_invert on the copy. */
				778	+ fe_loose l;
				779	+ fe_copy_lt(&l, z);
				780	+ fe_loose_invert(out, &l);
				781	+}
				782	+
				783	+/* Replace (f,g) with (g,f) if b == 1;
				784	+ * replace (f,g) with (f,g) if b == 0.
				785	+ *
				786	+ * Preconditions: b in {0,1}
				787	+ */
				788	+static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b)
				789	+{
				790	+ unsigned i;
				791	+ b = 0 - b; /* 0 stays 0, 1 becomes an all-ones mask */
				792	+ for (i = 0; i < 10; i++) { /* same XOR/AND sequence for every limb, no branch on b */
				793	+ u32 x = f->v[i] ^ g->v[i];
				794	+ x &= b;
				795	+ f->v[i] ^= x;
				796	+ g->v[i] ^= x;
				797	+ }
				798	+}
				799	+
				800	+/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0. Only x23 (121666) is a non-zero in2 limb; x25..x39 are the constant 0. */
				801	+static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10])
				802	+{ /* out = carried 10-limb result of in1 * 121666; limb widths alternate 26/25 bits as the shifts and masks below show. */
				803	+ { const u32 x20 = in1[9];
				804	+ { const u32 x21 = in1[8];
				805	+ { const u32 x19 = in1[7];
				806	+ { const u32 x17 = in1[6];
				807	+ { const u32 x15 = in1[5];
				808	+ { const u32 x13 = in1[4];
				809	+ { const u32 x11 = in1[3];
				810	+ { const u32 x9 = in1[2];
				811	+ { const u32 x7 = in1[1];
				812	+ { const u32 x5 = in1[0];
				813	+ { const u32 x38 = 0; /* x38..x25: limbs 9..1 of the second operand, all zero */
				814	+ { const u32 x39 = 0;
				815	+ { const u32 x37 = 0;
				816	+ { const u32 x35 = 0;
				817	+ { const u32 x33 = 0;
				818	+ { const u32 x31 = 0;
				819	+ { const u32 x29 = 0;
				820	+ { const u32 x27 = 0;
				821	+ { const u32 x25 = 0;
				822	+ { const u32 x23 = 121666; /* limb 0 of the second operand */
				823	+ { u64 x40 = ((u64)x23 * x5); /* x40..x49: only the x23 term of each sum is non-zero */
				824	+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
				825	+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
				826	+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
				827	+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
				828	+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
				829	+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
				830	+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
				831	+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
				832	+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
				833	+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9)); /* x50..x58: every term has a zero factor, so these are 0 */
				834	+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
				835	+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
				836	+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
				837	+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
				838	+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
				839	+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
				840	+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
				841	+ { u64 x58 = ((u64)(0x2 * x38) * x20);
				842	+ { u64 x59 = (x48 + (x58 << 0x4)); /* x59..x85: add 19x (16x + 2x + 1x) each high column into the matching low column */
				843	+ { u64 x60 = (x59 + (x58 << 0x1));
				844	+ { u64 x61 = (x60 + x58);
				845	+ { u64 x62 = (x47 + (x57 << 0x4));
				846	+ { u64 x63 = (x62 + (x57 << 0x1));
				847	+ { u64 x64 = (x63 + x57);
				848	+ { u64 x65 = (x46 + (x56 << 0x4));
				849	+ { u64 x66 = (x65 + (x56 << 0x1));
				850	+ { u64 x67 = (x66 + x56);
				851	+ { u64 x68 = (x45 + (x55 << 0x4));
				852	+ { u64 x69 = (x68 + (x55 << 0x1));
				853	+ { u64 x70 = (x69 + x55);
				854	+ { u64 x71 = (x44 + (x54 << 0x4));
				855	+ { u64 x72 = (x71 + (x54 << 0x1));
				856	+ { u64 x73 = (x72 + x54);
				857	+ { u64 x74 = (x43 + (x53 << 0x4));
				858	+ { u64 x75 = (x74 + (x53 << 0x1));
				859	+ { u64 x76 = (x75 + x53);
				860	+ { u64 x77 = (x42 + (x52 << 0x4));
				861	+ { u64 x78 = (x77 + (x52 << 0x1));
				862	+ { u64 x79 = (x78 + x52);
				863	+ { u64 x80 = (x41 + (x51 << 0x4));
				864	+ { u64 x81 = (x80 + (x51 << 0x1));
				865	+ { u64 x82 = (x81 + x51);
				866	+ { u64 x83 = (x40 + (x50 << 0x4));
				867	+ { u64 x84 = (x83 + (x50 << 0x1));
				868	+ { u64 x85 = (x84 + x50);
				869	+ { u64 x86 = (x85 >> 0x1a); /* carry chain: keep 26 (0x1a) or 25 (0x19) low bits, push the rest into the next column */
				870	+ { u32 x87 = ((u32)x85 & 0x3ffffff);
				871	+ { u64 x88 = (x86 + x82);
				872	+ { u64 x89 = (x88 >> 0x19);
				873	+ { u32 x90 = ((u32)x88 & 0x1ffffff);
				874	+ { u64 x91 = (x89 + x79);
				875	+ { u64 x92 = (x91 >> 0x1a);
				876	+ { u32 x93 = ((u32)x91 & 0x3ffffff);
				877	+ { u64 x94 = (x92 + x76);
				878	+ { u64 x95 = (x94 >> 0x19);
				879	+ { u32 x96 = ((u32)x94 & 0x1ffffff);
				880	+ { u64 x97 = (x95 + x73);
				881	+ { u64 x98 = (x97 >> 0x1a);
				882	+ { u32 x99 = ((u32)x97 & 0x3ffffff);
				883	+ { u64 x100 = (x98 + x70);
				884	+ { u64 x101 = (x100 >> 0x19);
				885	+ { u32 x102 = ((u32)x100 & 0x1ffffff);
				886	+ { u64 x103 = (x101 + x67);
				887	+ { u64 x104 = (x103 >> 0x1a);
				888	+ { u32 x105 = ((u32)x103 & 0x3ffffff);
				889	+ { u64 x106 = (x104 + x64);
				890	+ { u64 x107 = (x106 >> 0x19);
				891	+ { u32 x108 = ((u32)x106 & 0x1ffffff);
				892	+ { u64 x109 = (x107 + x61);
				893	+ { u64 x110 = (x109 >> 0x1a);
				894	+ { u32 x111 = ((u32)x109 & 0x3ffffff);
				895	+ { u64 x112 = (x110 + x49);
				896	+ { u64 x113 = (x112 >> 0x19);
				897	+ { u32 x114 = ((u32)x112 & 0x1ffffff);
				898	+ { u64 x115 = (x87 + (0x13 * x113)); /* carry out of limb 9 re-enters limb 0 multiplied by 19 (0x13) */
				899	+ { u32 x116 = (u32) (x115 >> 0x1a);
				900	+ { u32 x117 = ((u32)x115 & 0x3ffffff);
				901	+ { u32 x118 = (x116 + x90);
				902	+ { u32 x119 = (x118 >> 0x19);
				903	+ { u32 x120 = (x118 & 0x1ffffff);
				904	+ out[0] = x117;
				905	+ out[1] = x120;
				906	+ out[2] = (x119 + x93); /* last carry is added to limb 2 without masking again */
				907	+ out[3] = x96;
				908	+ out[4] = x99;
				909	+ out[5] = x102;
				910	+ out[6] = x105;
				911	+ out[7] = x108;
				912	+ out[8] = x111;
				913	+ out[9] = x114;
				914	+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
				915	+}
				916	+
				917	+static __always_inline void fe_mul121666(fe *h, const fe_loose *f)
				918	+{ /* Typed wrapper: hands the limb arrays of f and h to fe_mul_121666_impl. */
				919	+ fe_mul_121666_impl(h->v, f->v);
				920	+}
				921	+
				922	+void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
				923	+ const u8 scalar[CURVE25519_KEY_SIZE],
				924	+ const u8 point[CURVE25519_KEY_SIZE])
				925	+{ /* Writes to out the encoded x2/z2 reached after a 255-step ladder over the clamped copy of scalar, starting from point. */
				926	+ fe x1, x2, z2, x3, z3;
				927	+ fe_loose x2l, z2l, x3l;
				928	+ unsigned swap = 0;
				929	+ int pos;
				930	+ u8 e[32];
				931	+
				932	+ memcpy(e, scalar, 32); /* work on a local copy so the caller's scalar is not modified */
				933	+ curve25519_clamp_secret(e);
				934	+
				935	+ /* The following implementation was transcribed to Coq and proven to
				936	+ * correspond to unary scalar multiplication in affine coordinates given
				937	+ * that x1 != 0 is the x coordinate of some point on the curve. It was
				938	+ * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
				939	+ * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
				940	+ * quantified over the underlying field, so it applies to Curve25519
				941	+ * itself and the quadratic twist of Curve25519. It was not proven in
				942	+ * Coq that prime-field arithmetic correctly simulates extension-field
				943	+ * arithmetic on prime-field values. The decoding of the byte array
				944	+ * representation of e was not considered.
				945	+ *
				946	+ * Specification of Montgomery curves in affine coordinates:
				947	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
				948	+ *
				949	+ * Proof that these form a group that is isomorphic to a Weierstrass
				950	+ * curve:
				951	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
				952	+ *
				953	+ * Coq transcription and correctness proof of the loop
				954	+ * (where scalarbits=255):
				955	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
				956	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
				957	+ * preconditions: 0 <= e < 2^255 (not necessarily e < order),
				958	+ * fe_invert(0) = 0
				959	+ */
				960	+ fe_frombytes(&x1, point);
				961	+ fe_1(&x2);
				962	+ fe_0(&z2);
				963	+ fe_copy(&x3, &x1);
				964	+ fe_1(&z3);
				965	+
				966	+ for (pos = 254; pos >= 0; --pos) { /* one ladder step per scalar bit, from bit 254 down to bit 0 */
				967	+ fe tmp0, tmp1;
				968	+ fe_loose tmp0l, tmp1l;
				969	+ /* loop invariant as of right before the test, for the case
				970	+ * where x1 != 0:
				971	+ * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
				972	+ * is nonzero
				973	+ * let r := e >> (pos+1) in the following equalities of
				974	+ * projective points:
				975	+ * to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
				976	+ * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
				977	+ * x1 is the nonzero x coordinate of the nonzero
				978	+ * point (rP-(r+1)P)
				979	+ */
				980	+ unsigned b = 1 & (e[pos / 8] >> (pos & 7)); /* bit number pos of e */
				981	+ swap ^= b; /* swap only when this bit differs from the previous one */
				982	+ fe_cswap(&x2, &x3, swap);
				983	+ fe_cswap(&z2, &z3, swap);
				984	+ swap = b;
				985	+ /* Coq transcription of ladderstep formula (called from
				986	+ * transcribed loop):
				987	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
				988	+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
				989	+ * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
				990	+ * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
				991	+ */
				992	+ fe_sub(&tmp0l, &x3, &z3);
				993	+ fe_sub(&tmp1l, &x2, &z2);
				994	+ fe_add(&x2l, &x2, &z2);
				995	+ fe_add(&z2l, &x3, &z3);
				996	+ fe_mul_tll(&z3, &tmp0l, &x2l);
				997	+ fe_mul_tll(&z2, &z2l, &tmp1l);
				998	+ fe_sq_tl(&tmp0, &tmp1l);
				999	+ fe_sq_tl(&tmp1, &x2l);
				1000	+ fe_add(&x3l, &z3, &z2);
				1001	+ fe_sub(&z2l, &z3, &z2);
				1002	+ fe_mul_ttt(&x2, &tmp1, &tmp0);
				1003	+ fe_sub(&tmp1l, &tmp1, &tmp0);
				1004	+ fe_sq_tl(&z2, &z2l);
				1005	+ fe_mul121666(&z3, &tmp1l);
				1006	+ fe_sq_tl(&x3, &x3l);
				1007	+ fe_add(&tmp0l, &tmp0, &z3);
				1008	+ fe_mul_ttt(&z3, &x1, &z2);
				1009	+ fe_mul_tll(&z2, &tmp1l, &tmp0l);
				1010	+ }
				1011	+ /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
				1012	+ * else (x2, z2)
				1013	+ */
				1014	+ fe_cswap(&x2, &x3, swap);
				1015	+ fe_cswap(&z2, &z3, swap);
				1016	+
				1017	+ fe_invert(&z2, &z2); /* in place: z2 becomes fe_invert(z2) */
				1018	+ fe_mul_ttt(&x2, &x2, &z2); /* x2 = x2 * inverted z2 */
				1019	+ fe_tobytes(out, &x2);
				1020	+
				1021	+ memzero_explicit(&x1, sizeof(x1)); /* wipe the function-scope intermediates and the scalar copy before returning */
				1022	+ memzero_explicit(&x2, sizeof(x2));
				1023	+ memzero_explicit(&z2, sizeof(z2));
				1024	+ memzero_explicit(&x3, sizeof(x3));
				1025	+ memzero_explicit(&z3, sizeof(z3));
				1026	+ memzero_explicit(&x2l, sizeof(x2l));
				1027	+ memzero_explicit(&z2l, sizeof(z2l));
				1028	+ memzero_explicit(&x3l, sizeof(x3l));
				1029	+ memzero_explicit(&e, sizeof(e));
				1030	+}
				1031	--- /dev/null
				1032	+++ b/lib/crypto/curve25519-hacl64.c
				1033	@@ -0,0 +1,788 @@
				1034	+// SPDX-License-Identifier: GPL-2.0 OR MIT
				1035	+/*
				1036	+ * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
				1037	+ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
				1038	+ *
				1039	+ * This is a machine-generated formally verified implementation of Curve25519
				1040	+ * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
				1041	+ * generated, it has been tweaked to be suitable for use in the kernel. It is
				1042	+ * optimized for 64-bit machines that can efficiently work with 128-bit
				1043	+ * integer types.
				1044	+ */
				1045	+
				1046	+#include <asm/unaligned.h>
				1047	+#include <crypto/curve25519.h>
				1048	+#include <linux/string.h>
				1049	+
				1050	+typedef __uint128_t u128;
				1051	+
				1052	+static __always_inline u64 u64_eq_mask(u64 a, u64 b)
				1053	+{ /* Branch-free: returns all-ones when a == b, 0 otherwise. */
				1054	+ u64 x = a ^ b;
				1055	+ u64 minus_x = ~x + (u64)1U;
				1056	+ u64 x_or_minus_x = x | minus_x; /* top bit is set exactly when x != 0 */
				1057	+ u64 xnx = x_or_minus_x >> (u32)63U;
				1058	+ u64 c = xnx - (u64)1U; /* 1 - 1 = 0, or 0 - 1 = all-ones */
				1059	+ return c;
				1060	+}
				1061	+
				1062	+static __always_inline u64 u64_gte_mask(u64 a, u64 b)
				1063	+{ /* Branch-free: returns all-ones when a >= b (unsigned), 0 otherwise. */
				1064	+ u64 x = a;
				1065	+ u64 y = b;
				1066	+ u64 x_xor_y = x ^ y;
				1067	+ u64 x_sub_y = x - y;
				1068	+ u64 x_sub_y_xor_y = x_sub_y ^ y;
				1069	+ u64 q = x_xor_y | x_sub_y_xor_y;
				1070	+ u64 x_xor_q = x ^ q; /* top bit is set exactly when x < y */
				1071	+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
				1072	+ u64 c = x_xor_q_ - (u64)1U;
				1073	+ return c;
				1074	+}
				1075	+
				1076	+static __always_inline void modulo_carry_top(u64 *b)
				1077	+{ /* Keeps the low 51 bits of b[4] and adds 19 times the bits above them into b[0]. */
				1078	+ u64 b4 = b[4];
				1079	+ u64 b0 = b[0];
				1080	+ u64 b4_ = b4 & 0x7ffffffffffffLLU;
				1081	+ u64 b0_ = b0 + 19 * (b4 >> 51);
				1082	+ b[4] = b4_;
				1083	+ b[0] = b0_;
				1084	+}
				1085	+
				1086	+static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
				1087	+{ /* Stores the low 64 bits of input[0..4] into output[0..4]. */
				1088	+ {
				1089	+ u128 xi = input[0];
				1090	+ output[0] = ((u64)(xi));
				1091	+ }
				1092	+ {
				1093	+ u128 xi = input[1];
				1094	+ output[1] = ((u64)(xi));
				1095	+ }
				1096	+ {
				1097	+ u128 xi = input[2];
				1098	+ output[2] = ((u64)(xi));
				1099	+ }
				1100	+ {
				1101	+ u128 xi = input[3];
				1102	+ output[3] = ((u64)(xi));
				1103	+ }
				1104	+ {
				1105	+ u128 xi = input[4];
				1106	+ output[4] = ((u64)(xi));
				1107	+ }
				1108	+}
				1109	+
				1110	+static __always_inline void
				1111	+fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
				1112	+{ /* output[i] += input[i] * s for i = 0..4; each product is formed in 128 bits. */
				1113	+ output[0] += (u128)input[0] * s;
				1114	+ output[1] += (u128)input[1] * s;
				1115	+ output[2] += (u128)input[2] * s;
				1116	+ output[3] += (u128)input[3] * s;
				1117	+ output[4] += (u128)input[4] * s;
				1118	+}
				1119	+
				1120	+static __always_inline void fproduct_carry_wide_(u128 *tmp)
				1121	+{ /* For ctr = 0..3: keep the low 51 bits of tmp[ctr] and add the remaining high bits to tmp[ctr + 1]. tmp[4] is not reduced here. */
				1122	+ {
				1123	+ u32 ctr = 0;
				1124	+ u128 tctr = tmp[ctr];
				1125	+ u128 tctrp1 = tmp[ctr + 1];
				1126	+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				1127	+ u128 c = ((tctr) >> (51));
				1128	+ tmp[ctr] = ((u128)(r0));
				1129	+ tmp[ctr + 1] = ((tctrp1) + (c));
				1130	+ }
				1131	+ {
				1132	+ u32 ctr = 1;
				1133	+ u128 tctr = tmp[ctr];
				1134	+ u128 tctrp1 = tmp[ctr + 1];
				1135	+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				1136	+ u128 c = ((tctr) >> (51));
				1137	+ tmp[ctr] = ((u128)(r0));
				1138	+ tmp[ctr + 1] = ((tctrp1) + (c));
				1139	+ }
				1140	+
				1141	+ {
				1142	+ u32 ctr = 2;
				1143	+ u128 tctr = tmp[ctr];
				1144	+ u128 tctrp1 = tmp[ctr + 1];
				1145	+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				1146	+ u128 c = ((tctr) >> (51));
				1147	+ tmp[ctr] = ((u128)(r0));
				1148	+ tmp[ctr + 1] = ((tctrp1) + (c));
				1149	+ }
				1150	+ {
				1151	+ u32 ctr = 3;
				1152	+ u128 tctr = tmp[ctr];
				1153	+ u128 tctrp1 = tmp[ctr + 1];
				1154	+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
				1155	+ u128 c = ((tctr) >> (51));
				1156	+ tmp[ctr] = ((u128)(r0));
				1157	+ tmp[ctr + 1] = ((tctrp1) + (c));
				1158	+ }
				1159	+}
				1160	+
				1161	+static __always_inline void fmul_shift_reduce(u64 *output)
				1162	+{ /* Moves output[0..3] up one slot to output[1..4] and sets output[0] to 19 times the old output[4]. */
				1163	+ u64 tmp = output[4]; /* saved before slot 4 is overwritten */
				1164	+ u64 b0;
				1165	+ {
				1166	+ u32 ctr = 5 - 0 - 1;
				1167	+ u64 z = output[ctr - 1];
				1168	+ output[ctr] = z;
				1169	+ }
				1170	+ {
				1171	+ u32 ctr = 5 - 1 - 1;
				1172	+ u64 z = output[ctr - 1];
				1173	+ output[ctr] = z;
				1174	+ }
				1175	+ {
				1176	+ u32 ctr = 5 - 2 - 1;
				1177	+ u64 z = output[ctr - 1];
				1178	+ output[ctr] = z;
				1179	+ }
				1180	+ {
				1181	+ u32 ctr = 5 - 3 - 1;
				1182	+ u64 z = output[ctr - 1];
				1183	+ output[ctr] = z;
				1184	+ }
				1185	+ output[0] = tmp;
				1186	+ b0 = output[0];
				1187	+ output[0] = 19 * b0;
				1188	+}
				1189	+
				1190	+static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
				1191	+ u64 *input21)
				1192	+{ /* For each of the 5 limbs of input21: accumulate input * limb into output, then (except after the last limb) apply fmul_shift_reduce to input. input is modified. */
				1193	+ u32 i;
				1194	+ u64 input2i;
				1195	+ {
				1196	+ u64 input2i = input21[0];
				1197	+ fproduct_sum_scalar_multiplication_(output, input, input2i);
				1198	+ fmul_shift_reduce(input);
				1199	+ }
				1200	+ {
				1201	+ u64 input2i = input21[1];
				1202	+ fproduct_sum_scalar_multiplication_(output, input, input2i);
				1203	+ fmul_shift_reduce(input);
				1204	+ }
				1205	+ {
				1206	+ u64 input2i = input21[2];
				1207	+ fproduct_sum_scalar_multiplication_(output, input, input2i);
				1208	+ fmul_shift_reduce(input);
				1209	+ }
				1210	+ {
				1211	+ u64 input2i = input21[3];
				1212	+ fproduct_sum_scalar_multiplication_(output, input, input2i);
				1213	+ fmul_shift_reduce(input);
				1214	+ }
				1215	+ i = 4;
				1216	+ input2i = input21[i];
				1217	+ fproduct_sum_scalar_multiplication_(output, input, input2i);
				1218	+}
				1219	+
				1220	+static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
				1221	+{ /* output = carried 5-limb product of input and input21. input is copied first, so output may alias either operand (crecip_crecip relies on this). */
				1222	+ u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
				1223	+ {
				1224	+ u128 b4;
				1225	+ u128 b0;
				1226	+ u128 b4_;
				1227	+ u128 b0_;
				1228	+ u64 i0;
				1229	+ u64 i1;
				1230	+ u64 i0_;
				1231	+ u64 i1_;
				1232	+ u128 t[5] = { 0 };
				1233	+ fmul_mul_shift_reduce_(t, tmp, input21);
				1234	+ fproduct_carry_wide_(t);
				1235	+ b4 = t[4];
				1236	+ b0 = t[0];
				1237	+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				1238	+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* bits above 51 in limb 4 re-enter limb 0 times 19 */
				1239	+ t[4] = b4_;
				1240	+ t[0] = b0_;
				1241	+ fproduct_copy_from_wide_(output, t);
				1242	+ i0 = output[0];
				1243	+ i1 = output[1];
				1244	+ i0_ = i0 & 0x7ffffffffffffLLU;
				1245	+ i1_ = i1 + (i0 >> 51); /* one more carry from limb 0 into limb 1 */
				1246	+ output[0] = i0_;
				1247	+ output[1] = i1_;
				1248	+ }
				1249	+}
				1250	+
				1251	+static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
				1252	+{ /* Writes into tmp[0..4] the five uncarried 128-bit column sums of output * output; wrapped-around terms carry a factor 19. */
				1253	+ u64 r0 = output[0];
				1254	+ u64 r1 = output[1];
				1255	+ u64 r2 = output[2];
				1256	+ u64 r3 = output[3];
				1257	+ u64 r4 = output[4];
				1258	+ u64 d0 = r0 * 2;
				1259	+ u64 d1 = r1 * 2;
				1260	+ u64 d2 = r2 * 2 * 19;
				1261	+ u64 d419 = r4 * 19;
				1262	+ u64 d4 = d419 * 2;
				1263	+ u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
				1264	+ (((u128)(d2) * (r3))));
				1265	+ u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
				1266	+ (((u128)(r3 * 19) * (r3))));
				1267	+ u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
				1268	+ (((u128)(d4) * (r3))));
				1269	+ u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
				1270	+ (((u128)(r4) * (d419))));
				1271	+ u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
				1272	+ (((u128)(r2) * (r2))));
				1273	+ tmp[0] = s0;
				1274	+ tmp[1] = s1;
				1275	+ tmp[2] = s2;
				1276	+ tmp[3] = s3;
				1277	+ tmp[4] = s4;
				1278	+}
				1279	+
				1280	+static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
				1281	+{ /* Squares output in place: column sums via fsquare_fsquare__ into tmp, then the same carry steps as fmul_fmul. tmp is scratch space. */
				1282	+ u128 b4;
				1283	+ u128 b0;
				1284	+ u128 b4_;
				1285	+ u128 b0_;
				1286	+ u64 i0;
				1287	+ u64 i1;
				1288	+ u64 i0_;
				1289	+ u64 i1_;
				1290	+ fsquare_fsquare__(tmp, output);
				1291	+ fproduct_carry_wide_(tmp);
				1292	+ b4 = tmp[4];
				1293	+ b0 = tmp[0];
				1294	+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				1295	+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* bits above 51 in limb 4 re-enter limb 0 times 19 */
				1296	+ tmp[4] = b4_;
				1297	+ tmp[0] = b0_;
				1298	+ fproduct_copy_from_wide_(output, tmp);
				1299	+ i0 = output[0];
				1300	+ i1 = output[1];
				1301	+ i0_ = i0 & 0x7ffffffffffffLLU;
				1302	+ i1_ = i1 + (i0 >> 51);
				1303	+ output[0] = i0_;
				1304	+ output[1] = i1_;
				1305	+}
				1306	+
				1307	+static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
				1308	+ u32 count1)
				1309	+{ /* Applies fsquare_fsquare_ to output once, then count1 - 1 more times (so at least once even if count1 is 0). */
				1310	+ u32 i;
				1311	+ fsquare_fsquare_(tmp, output);
				1312	+ for (i = 1; i < count1; ++i)
				1313	+ fsquare_fsquare_(tmp, output);
				1314	+}
				1315	+
				1316	+static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
				1317	+ u32 count1)
				1318	+{ /* Copies the 5 limbs of input to output, then squares output in place count1 times using a local scratch array. */
				1319	+ u128 t[5];
				1320	+ memcpy(output, input, 5 * sizeof(*input));
				1321	+ fsquare_fsquare_times_(output, t, count1);
				1322	+}
				1323	+
				1324	+static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
				1325	+ u32 count1)
				1326	+{ /* Same as fsquare_fsquare_times without the initial copy: squares output in place count1 times. */
				1327	+ u128 t[5]; /* scratch for the wide intermediate sums; written before it is read by fsquare_fsquare__ */
				1328	+ fsquare_fsquare_times_(output, t, count1);
				1329	+}
				1330	+
				1331	+static __always_inline void crecip_crecip(u64 *out, u64 *z)
				1332	+{ /* Fixed chain of squarings and multiplications; reading fsquare_fsquare_times(o, x, n) as x^(2^n) and fmul_fmul as a product, out = z^(2^255 - 21). */
				1333	+ u64 buf[20] = { 0 }; /* four 5-limb temporaries at offsets 0, 5, 10 and 15 */
				1334	+ u64 *a0 = buf;
				1335	+ u64 *t00 = buf + 5;
				1336	+ u64 *b0 = buf + 10;
				1337	+ u64 *t01;
				1338	+ u64 *b1;
				1339	+ u64 *c0;
				1340	+ u64 *a;
				1341	+ u64 *t0;
				1342	+ u64 *b;
				1343	+ u64 *c;
				1344	+ fsquare_fsquare_times(a0, z, 1); /* a0 = z^2 */
				1345	+ fsquare_fsquare_times(t00, a0, 2); /* t00 = z^8 */
				1346	+ fmul_fmul(b0, t00, z); /* b0 = z^9 */
				1347	+ fmul_fmul(a0, b0, a0); /* a0 = z^11 */
				1348	+ fsquare_fsquare_times(t00, a0, 1);
				1349	+ fmul_fmul(b0, t00, b0); /* b0 = z^(2^5 - 1) */
				1350	+ fsquare_fsquare_times(t00, b0, 5);
				1351	+ t01 = buf + 5; /* t01, b1, c0 alias the same slots as t00, b0 and the so-far unused buf + 15 */
				1352	+ b1 = buf + 10;
				1353	+ c0 = buf + 15;
				1354	+ fmul_fmul(b1, t01, b1); /* b1 = z^(2^10 - 1) */
				1355	+ fsquare_fsquare_times(t01, b1, 10);
				1356	+ fmul_fmul(c0, t01, b1); /* c0 = z^(2^20 - 1) */
				1357	+ fsquare_fsquare_times(t01, c0, 20);
				1358	+ fmul_fmul(t01, t01, c0); /* t01 = z^(2^40 - 1) */
				1359	+ fsquare_fsquare_times_inplace(t01, 10);
				1360	+ fmul_fmul(b1, t01, b1); /* b1 = z^(2^50 - 1) */
				1361	+ fsquare_fsquare_times(t01, b1, 50);
				1362	+ a = buf; /* a, t0, b, c alias the same four slots again */
				1363	+ t0 = buf + 5;
				1364	+ b = buf + 10;
				1365	+ c = buf + 15;
				1366	+ fmul_fmul(c, t0, b); /* c = z^(2^100 - 1) */
				1367	+ fsquare_fsquare_times(t0, c, 100);
				1368	+ fmul_fmul(t0, t0, c); /* t0 = z^(2^200 - 1) */
				1369	+ fsquare_fsquare_times_inplace(t0, 50);
				1370	+ fmul_fmul(t0, t0, b); /* t0 = z^(2^250 - 1) */
				1371	+ fsquare_fsquare_times_inplace(t0, 5); /* t0 = z^(2^255 - 32) */
				1372	+ fmul_fmul(out, t0, a); /* times z^11 */
				1373	+}
				1374	+
				1375	+static __always_inline void fsum(u64 *a, u64 *b)
				1376	+{ /* a[i] += b[i] for i = 0..4; no carry propagation is done here. */
				1377	+ a[0] += b[0];
				1378	+ a[1] += b[1];
				1379	+ a[2] += b[2];
				1380	+ a[3] += b[3];
				1381	+ a[4] += b[4];
				1382	+}
				1383	+
				1384	+static __always_inline void fdifference(u64 *a, u64 *b)
				1385	+{ /* a[i] = (b[i] + k[i]) - a[i] for i = 0..4: the result is b minus a (note the operand order), with a constant added to b first. b is not modified. */
				1386	+ u64 tmp[5] = { 0 };
				1387	+ u64 b0;
				1388	+ u64 b1;
				1389	+ u64 b2;
				1390	+ u64 b3;
				1391	+ u64 b4;
				1392	+ memcpy(tmp, b, 5 * sizeof(*b));
				1393	+ b0 = tmp[0];
				1394	+ b1 = tmp[1];
				1395	+ b2 = tmp[2];
				1396	+ b3 = tmp[3];
				1397	+ b4 = tmp[4];
				1398	+ tmp[0] = b0 + 0x3fffffffffff68LLU; /* 8 * (2^51 - 19) */
				1399	+ tmp[1] = b1 + 0x3ffffffffffff8LLU; /* 8 * (2^51 - 1), same for limbs 2..4 */
				1400	+ tmp[2] = b2 + 0x3ffffffffffff8LLU;
				1401	+ tmp[3] = b3 + 0x3ffffffffffff8LLU;
				1402	+ tmp[4] = b4 + 0x3ffffffffffff8LLU;
				1403	+ {
				1404	+ u64 xi = a[0];
				1405	+ u64 yi = tmp[0];
				1406	+ a[0] = yi - xi;
				1407	+ }
				1408	+ {
				1409	+ u64 xi = a[1];
				1410	+ u64 yi = tmp[1];
				1411	+ a[1] = yi - xi;
				1412	+ }
				1413	+ {
				1414	+ u64 xi = a[2];
				1415	+ u64 yi = tmp[2];
				1416	+ a[2] = yi - xi;
				1417	+ }
				1418	+ {
				1419	+ u64 xi = a[3];
				1420	+ u64 yi = tmp[3];
				1421	+ a[3] = yi - xi;
				1422	+ }
				1423	+ {
				1424	+ u64 xi = a[4];
				1425	+ u64 yi = tmp[4];
				1426	+ a[4] = yi - xi;
				1427	+ }
				1428	+}
				1429	+
				1430	+static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
				1431	+{ /* output = b * s limb by limb in 128 bits, followed by one carry pass and the times-19 fold of the top limb. */
				1432	+ u128 tmp[5];
				1433	+ u128 b4;
				1434	+ u128 b0;
				1435	+ u128 b4_;
				1436	+ u128 b0_;
				1437	+ {
				1438	+ u64 xi = b[0];
				1439	+ tmp[0] = ((u128)(xi) * (s));
				1440	+ }
				1441	+ {
				1442	+ u64 xi = b[1];
				1443	+ tmp[1] = ((u128)(xi) * (s));
				1444	+ }
				1445	+ {
				1446	+ u64 xi = b[2];
				1447	+ tmp[2] = ((u128)(xi) * (s));
				1448	+ }
				1449	+ {
				1450	+ u64 xi = b[3];
				1451	+ tmp[3] = ((u128)(xi) * (s));
				1452	+ }
				1453	+ {
				1454	+ u64 xi = b[4];
				1455	+ tmp[4] = ((u128)(xi) * (s));
				1456	+ }
				1457	+ fproduct_carry_wide_(tmp);
				1458	+ b4 = tmp[4];
				1459	+ b0 = tmp[0];
				1460	+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
				1461	+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); /* bits above 51 in limb 4 re-enter limb 0 times 19 */
				1462	+ tmp[4] = b4_;
				1463	+ tmp[0] = b0_;
				1464	+ fproduct_copy_from_wide_(output, tmp);
				1465	+}
				1466	+
				1467	+static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
				1468	+{ /* Thin alias for fmul_fmul. */
				1469	+ fmul_fmul(output, a, b);
				1470	+}
				1471	+
				1472	+static __always_inline void crecip(u64 *output, u64 *input)
				1473	+{ /* Thin alias for crecip_crecip. */
				1474	+ crecip_crecip(output, input);
				1475	+}
				1476	+
				1477	+static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
				1478	+ u64 swap1, u32 ctr)
				1479	+{ /* Swaps a[ctr - 1] and b[ctr - 1] through an XOR mask: exchanged when swap1 is all-ones, unchanged when swap1 is 0. */
				1480	+ u32 i = ctr - 1;
				1481	+ u64 ai = a[i];
				1482	+ u64 bi = b[i];
				1483	+ u64 x = swap1 & (ai ^ bi);
				1484	+ u64 ai1 = ai ^ x;
				1485	+ u64 bi1 = bi ^ x;
				1486	+ a[i] = ai1;
				1487	+ b[i] = bi1;
				1488	+}
				1489	+
				1490	+static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
				1491	+{ /* Applies the masked swap to all five limbs, index 4 down to 0. */
				1492	+ point_swap_conditional_step(a, b, swap1, 5);
				1493	+ point_swap_conditional_step(a, b, swap1, 4);
				1494	+ point_swap_conditional_step(a, b, swap1, 3);
				1495	+ point_swap_conditional_step(a, b, swap1, 2);
				1496	+ point_swap_conditional_step(a, b, swap1, 1);
				1497	+}
				1498	+
				1499	+static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
				1500	+{ /* Masked swap of two 10-limb points (two 5-limb halves each). Callers in this file pass iswap as 0 or 1. */
				1501	+ u64 swap1 = 0 - iswap; /* 0 stays 0, 1 becomes all-ones */
				1502	+ point_swap_conditional5(a, b, swap1);
				1503	+ point_swap_conditional5(a + 5, b + 5, swap1);
				1504	+}
				1505	+
				1506	+static __always_inline void point_copy(u64 *output, u64 *input)
				1507	+{ /* Copies a 10-limb point as two 5-limb halves. */
				1508	+ memcpy(output, input, 5 * sizeof(*input));
				1509	+ memcpy(output + 5, input + 5, 5 * sizeof(*input));
				1510	+}
				1511	+
				1512	+static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
				1513	+ u64 *pq, u64 *qmqp)
				1514	+{ /* Writes pp = (x2, z2) and ppq = (x3, z3) from p = (x, z), pq = (xprime, zprime) and the first 5 limbs of qmqp. p and pq are overwritten as scratch. */
				1515	+ u64 *qx = qmqp;
				1516	+ u64 *x2 = pp;
				1517	+ u64 *z2 = pp + 5;
				1518	+ u64 *x3 = ppq;
				1519	+ u64 *z3 = ppq + 5;
				1520	+ u64 *x = p;
				1521	+ u64 *z = p + 5;
				1522	+ u64 *xprime = pq;
				1523	+ u64 *zprime = pq + 5;
				1524	+ u64 buf[40] = { 0 }; /* eight 5-limb temporaries; several names below alias the same slot */
				1525	+ u64 *origx = buf;
				1526	+ u64 *origxprime0 = buf + 5;
				1527	+ u64 *xxprime0;
				1528	+ u64 *zzprime0;
				1529	+ u64 *origxprime;
				1530	+ xxprime0 = buf + 25;
				1531	+ zzprime0 = buf + 30;
				1532	+ memcpy(origx, x, 5 * sizeof(*x));
				1533	+ fsum(x, z); /* x = x + z */
				1534	+ fdifference(z, origx); /* z = origx - z (plus the constant added inside fdifference) */
				1535	+ memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
				1536	+ fsum(xprime, zprime);
				1537	+ fdifference(zprime, origxprime0);
				1538	+ fmul(xxprime0, xprime, z);
				1539	+ fmul(zzprime0, x, zprime);
				1540	+ origxprime = buf + 5;
				1541	+ {
				1542	+ u64 *xx0;
				1543	+ u64 *zz0;
				1544	+ u64 *xxprime;
				1545	+ u64 *zzprime;
				1546	+ u64 *zzzprime;
				1547	+ xx0 = buf + 15;
				1548	+ zz0 = buf + 20;
				1549	+ xxprime = buf + 25;
				1550	+ zzprime = buf + 30;
				1551	+ zzzprime = buf + 35;
				1552	+ memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
				1553	+ fsum(xxprime, zzprime);
				1554	+ fdifference(zzprime, origxprime);
				1555	+ fsquare_fsquare_times(x3, xxprime, 1);
				1556	+ fsquare_fsquare_times(zzzprime, zzprime, 1);
				1557	+ fmul(z3, zzzprime, qx);
				1558	+ fsquare_fsquare_times(xx0, x, 1);
				1559	+ fsquare_fsquare_times(zz0, z, 1);
				1560	+ {
				1561	+ u64 *zzz;
				1562	+ u64 *xx;
				1563	+ u64 *zz;
				1564	+ u64 scalar;
				1565	+ zzz = buf + 10;
				1566	+ xx = buf + 15;
				1567	+ zz = buf + 20;
				1568	+ fmul(x2, xx, zz);
				1569	+ fdifference(zz, xx); /* zz = xx - zz (plus the constant added inside fdifference) */
				1570	+ scalar = 121665;
				1571	+ fscalar(zzz, zz, scalar);
				1572	+ fsum(zzz, xx);
				1573	+ fmul(z2, zzz, zz);
				1574	+ }
				1575	+ }
				1576	+}
				1577	+
				1578	+static __always_inline void
				1579	+ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				1580	+ u64 *q, u8 byt)
				1581	+{ /* One ladder step driven by the top bit of byt: masked swap of the inputs, addanddouble_fmonty into (nq2, nqpq2), then the same masked swap on the outputs. */
				1582	+ u64 bit0 = (u64)(byt >> 7);
				1583	+ u64 bit;
				1584	+ point_swap_conditional(nq, nqpq, bit0);
				1585	+ addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
				1586	+ bit = (u64)(byt >> 7);
				1587	+ point_swap_conditional(nq2, nqpq2, bit);
				1588	+}
				1589	+
				1590	+static __always_inline void
				1591	+ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
				1592	+ u64 *nqpq2, u64 *q, u8 byt)
				1593	+{ /* Two ladder steps for the top two bits of byt. The second step swaps the roles of the buffer pairs, so the result lands back in (nq, nqpq). */
				1594	+ u8 byt1;
				1595	+ ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
				1596	+ byt1 = byt << 1;
				1597	+ ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
				1598	+}
				1599	+
				1600	+static __always_inline void
				1601	+ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				1602	+ u64 *q, u8 byt, u32 i)
				1603	+{ /* Runs i double steps, consuming two bits of byt (most significant first) per iteration. */
				1604	+ while (i--) {
				1605	+ ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
				1606	+ nqpq2, q, byt);
				1607	+ byt <<= 2;
				1608	+ }
				1609	+}
				1610	+
				1611	+static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
				1612	+ u64 *nqpq, u64 *nq2,
				1613	+ u64 *nqpq2, u64 *q,
				1614	+ u32 i)
				1615	+{ /* Walks bytes n1[i - 1] down to n1[0]; each byte drives 4 double steps, i.e. all 8 of its bits. */
				1616	+ while (i--) {
				1617	+ u8 byte = n1[i];
				1618	+ ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
				1619	+ byte, 4);
				1620	+ }
				1621	+}
				1622	+
				1623	+static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
				1624	+{ /* Runs the ladder over the 32 bytes of n1 starting from point q and copies the final nq (10 limbs) into result. */
				1625	+ u64 point_buf[40] = { 0 }; /* four 10-limb points: nq, nqpq, nq2, nqpq2 */
				1626	+ u64 *nq = point_buf;
				1627	+ u64 *nqpq = point_buf + 10;
				1628	+ u64 *nq2 = point_buf + 20;
				1629	+ u64 *nqpq2 = point_buf + 30;
				1630	+ point_copy(nqpq, q);
				1631	+ nq[0] = 1; /* nq starts as limbs (1, 0, ..., 0); the rest is zero from the initializer */
				1632	+ ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
				1633	+ point_copy(result, nq);
				1634	+}
				1635	+
				1636	+static __always_inline void format_fexpand(u64 *output, const u8 *input)
				1637	+{ /* Unpacks little-endian bytes into five 51-bit limbs: fields start at bit offsets 0, 51, 102, 153, 204. Reads input[0..31]. */
				1638	+ const u8 *x00 = input + 6;
				1639	+ const u8 *x01 = input + 12;
				1640	+ const u8 *x02 = input + 19;
				1641	+ const u8 *x0 = input + 24;
				1642	+ u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
				1643	+ i0 = get_unaligned_le64(input);
				1644	+ i1 = get_unaligned_le64(x00);
				1645	+ i2 = get_unaligned_le64(x01);
				1646	+ i3 = get_unaligned_le64(x02);
				1647	+ i4 = get_unaligned_le64(x0);
				1648	+ output0 = i0 & 0x7ffffffffffffLLU;
				1649	+ output1 = i1 >> 3 & 0x7ffffffffffffLLU; /* byte 6 is bit 48, +3 = bit 51 */
				1650	+ output2 = i2 >> 6 & 0x7ffffffffffffLLU; /* byte 12 is bit 96, +6 = bit 102 */
				1651	+ output3 = i3 >> 1 & 0x7ffffffffffffLLU; /* byte 19 is bit 152, +1 = bit 153 */
				1652	+ output4 = i4 >> 12 & 0x7ffffffffffffLLU; /* byte 24 is bit 192, +12 = bit 204; the mask drops bit 255 */
				1653	+ output[0] = output0;
				1654	+ output[1] = output1;
				1655	+ output[2] = output2;
				1656	+ output[3] = output3;
				1657	+ output[4] = output4;
				1658	+}
				1659	+/* One carry sweep: move bits above bit 50 of limbs 0..3 into the next limb up. */
				1660	+static __always_inline void format_fcontract_first_carry_pass(u64 *input)
				1661	+{
				1662	+ u64 t0 = input[0];
				1663	+ u64 t1 = input[1];
				1664	+ u64 t2 = input[2];
				1665	+ u64 t3 = input[3];
				1666	+ u64 t4 = input[4];
				1667	+ u64 t1_ = t1 + (t0 >> 51);
				1668	+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
				1669	+ u64 t2_ = t2 + (t1_ >> 51);
				1670	+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
				1671	+ u64 t3_ = t3 + (t2_ >> 51);
				1672	+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
				1673	+ u64 t4_ = t4 + (t3_ >> 51);
				1674	+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
				1675	+ input[0] = t0_;
				1676	+ input[1] = t1__;
				1677	+ input[2] = t2__;
				1678	+ input[3] = t3__;
				1679	+ input[4] = t4_; /* top limb receives the carry but is stored unmasked */
				1680	+}
				1681	+/* First carry sweep, then modulo_carry_top() (defined elsewhere) on the same limbs. */
				1682	+static __always_inline void format_fcontract_first_carry_full(u64 *input)
				1683	+{
				1684	+ format_fcontract_first_carry_pass(input);
				1685	+ modulo_carry_top(input); /* presumably folds the top limb's excess back in - confirm */
				1686	+}
				1687	+/* Second carry sweep; the statements are the same as in the first carry pass. */
				1688	+static __always_inline void format_fcontract_second_carry_pass(u64 *input)
				1689	+{
				1690	+ u64 t0 = input[0];
				1691	+ u64 t1 = input[1];
				1692	+ u64 t2 = input[2];
				1693	+ u64 t3 = input[3];
				1694	+ u64 t4 = input[4];
				1695	+ u64 t1_ = t1 + (t0 >> 51);
				1696	+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
				1697	+ u64 t2_ = t2 + (t1_ >> 51);
				1698	+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
				1699	+ u64 t3_ = t3 + (t2_ >> 51);
				1700	+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
				1701	+ u64 t4_ = t4 + (t3_ >> 51);
				1702	+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
				1703	+ input[0] = t0_;
				1704	+ input[1] = t1__;
				1705	+ input[2] = t2__;
				1706	+ input[3] = t3__;
				1707	+ input[4] = t4_; /* top limb receives the carry but is stored unmasked */
				1708	+}
				1709	+/* Second sweep plus modulo_carry_top(), then one more carry from limb 0 into limb 1. */
				1710	+static __always_inline void format_fcontract_second_carry_full(u64 *input)
				1711	+{
				1712	+ u64 i0;
				1713	+ u64 i1;
				1714	+ u64 i0_;
				1715	+ u64 i1_;
				1716	+ format_fcontract_second_carry_pass(input);
				1717	+ modulo_carry_top(input);
				1718	+ i0 = input[0];
				1719	+ i1 = input[1];
				1720	+ i0_ = i0 & 0x7ffffffffffffLLU; /* keep the low 51 bits of limb 0 */
				1721	+ i1_ = i1 + (i0 >> 51); /* anything above them goes to limb 1 */
				1722	+ input[0] = i0_;
				1723	+ input[1] = i1_;
				1724	+}
				1725	+/* Subtract the limbs of 2^255 - 19 from input when the combined mask selects it. */
				1726	+static __always_inline void format_fcontract_trim(u64 *input)
				1727	+{
				1728	+ u64 a0 = input[0];
				1729	+ u64 a1 = input[1];
				1730	+ u64 a2 = input[2];
				1731	+ u64 a3 = input[3];
				1732	+ u64 a4 = input[4];
				1733	+ u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU); /* 0x7ffffffffffed == 2^51 - 19 */
				1734	+ u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU); /* 0x7ffffffffffff == 2^51 - 1 */
				1735	+ u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
				1736	+ u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
				1737	+ u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
				1738	+ u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4; /* presumably all-ones or 0; helpers not shown - confirm */
				1739	+ u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
				1740	+ u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
				1741	+ u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
				1742	+ u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
				1743	+ u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
				1744	+ input[0] = a0_;
				1745	+ input[1] = a1_;
				1746	+ input[2] = a2_;
				1747	+ input[3] = a3_;
				1748	+ input[4] = a4_;
				1749	+}
				1750	+/* Pack five 51-bit limbs from input into 32 little-endian bytes at output. */
				1751	+static __always_inline void format_fcontract_store(u8 *output, u64 *input)
				1752	+{
				1753	+ u64 t0 = input[0];
				1754	+ u64 t1 = input[1];
				1755	+ u64 t2 = input[2];
				1756	+ u64 t3 = input[3];
				1757	+ u64 t4 = input[4];
				1758	+ u64 o0 = t1 << 51 | t0; /* limb 0 plus the low 13 bits of limb 1 */
				1759	+ u64 o1 = t2 << 38 | t1 >> 13; /* rest of limb 1 plus the low 26 bits of limb 2 */
				1760	+ u64 o2 = t3 << 25 | t2 >> 26; /* rest of limb 2 plus the low 39 bits of limb 3 */
				1761	+ u64 o3 = t4 << 12 | t3 >> 39; /* rest of limb 3 plus limb 4 */
				1762	+ u8 *b0 = output;
				1763	+ u8 *b1 = output + 8;
				1764	+ u8 *b2 = output + 16;
				1765	+ u8 *b3 = output + 24;
				1766	+ put_unaligned_le64(o0, b0);
				1767	+ put_unaligned_le64(o1, b1);
				1768	+ put_unaligned_le64(o2, b2);
				1769	+ put_unaligned_le64(o3, b3);
				1770	+}
				1771	+/* Carry input twice, trim it, then store it as 32 bytes at output; input is modified. */
				1772	+static __always_inline void format_fcontract(u8 *output, u64 *input)
				1773	+{
				1774	+ format_fcontract_first_carry_full(input);
				1775	+ format_fcontract_second_carry_full(input);
				1776	+ format_fcontract_trim(input);
				1777	+ format_fcontract_store(output, input);
				1778	+}
				1779	+/* Write fmul(x, crecip(z)) of the 10-word point (x then z) as 32 bytes at scalar. */
				1780	+static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
				1781	+{
				1782	+ u64 *x = point;
				1783	+ u64 *z = point + 5;
				1784	+ u64 buf[10] __aligned(32) = { 0 };
				1785	+ u64 *zmone = buf;
				1786	+ u64 *sc = buf + 5;
				1787	+ crecip(zmone, z); /* presumably the field inverse of z - defined elsewhere, confirm */
				1788	+ fmul(sc, x, zmone);
				1789	+ format_fcontract(scalar, sc);
				1790	+}
				1791	+/* Write to mypublic the 32-byte ladder result for the clamped secret and basepoint. */
				1792	+void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
				1793	+ const u8 secret[CURVE25519_KEY_SIZE],
				1794	+ const u8 basepoint[CURVE25519_KEY_SIZE])
				1795	+{
				1796	+ u64 buf0[10] __aligned(32) = { 0 };
				1797	+ u64 *x0 = buf0;
				1798	+ u64 *z = buf0 + 5;
				1799	+ u64 *q;
				1800	+ format_fexpand(x0, basepoint);
				1801	+ z[0] = 1; /* input point q is (x0 from basepoint, z with first limb 1) */
				1802	+ q = buf0;
				1803	+ {
				1804	+ u8 e[32] __aligned(32) = { 0 };
				1805	+ u8 *scalar;
				1806	+ memcpy(e, secret, 32); /* work on a local copy; secret itself is const */
				1807	+ curve25519_clamp_secret(e);
				1808	+ scalar = e;
				1809	+ {
				1810	+ u64 buf[15] = { 0 };
				1811	+ u64 *nq = buf;
				1812	+ u64 *x = nq;
				1813	+ x[0] = 1;
				1814	+ ladder_cmult(nq, scalar, q);
				1815	+ format_scalar_of_point(mypublic, nq);
				1816	+ memzero_explicit(buf, sizeof(buf)); /* wipe the ladder output */
				1817	+ }
				1818	+ memzero_explicit(e, sizeof(e)); /* wipe the clamped copy of the secret */
				1819	+ }
				1820	+ memzero_explicit(buf0, sizeof(buf0)); /* wipe the expanded basepoint */
				1821	+}
				1822	--- /dev/null
				1823	+++ b/lib/crypto/curve25519.c
				1824	@@ -0,0 +1,25 @@
				1825	+// SPDX-License-Identifier: GPL-2.0 OR MIT
				1826	+/*
				1827	+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
				1828	+ *
				1829	+ * This is an implementation of the Curve25519 ECDH algorithm, using either
				1830	+ * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
				1831	+ * depending on what is supported by the target compiler.
				1832	+ *
				1833	+ * Information: https://cr.yp.to/ecdh.html
				1834	+ */
				1835	+
				1836	+#include <crypto/curve25519.h>
				1837	+#include <linux/module.h>
				1838	+#include <linux/init.h>
				1839	+/* Null point: every byte 0. Base point: first byte 9, remaining bytes 0. */
				1840	+const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
				1841	+const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
				1842	+
				1843	+EXPORT_SYMBOL(curve25519_null_point);
				1844	+EXPORT_SYMBOL(curve25519_base_point);
				1845	+EXPORT_SYMBOL(curve25519_generic);
				1846	+
				1847	+MODULE_LICENSE("GPL v2");
				1848	+MODULE_DESCRIPTION("Curve25519 scalar multiplication");
				1849	+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");