Blame - ap/lib/libssl/openssl-1.1.1o/crypto/ec/ecp_nistp521.c - R306

blob: 08b32787293b8c1e8c13a44a2b2aac50b0e00bc3 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame^]	1	/*
				2	* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
				3	*
				4	* Licensed under the OpenSSL license (the "License"). You may not use
				5	* this file except in compliance with the License. You can obtain a copy
				6	* in the file LICENSE in the source distribution or at
				7	* https://www.openssl.org/source/license.html
				8	*/
				9
				10	/* Copyright 2011 Google Inc.
				11	*
				12	* Licensed under the Apache License, Version 2.0 (the "License");
				13	*
				14	* you may not use this file except in compliance with the License.
				15	* You may obtain a copy of the License at
				16	*
				17	* http://www.apache.org/licenses/LICENSE-2.0
				18	*
				19	* Unless required by applicable law or agreed to in writing, software
				20	* distributed under the License is distributed on an "AS IS" BASIS,
				21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				22	* See the License for the specific language governing permissions and
				23	* limitations under the License.
				24	*/
				25
				26	/*
				27	* A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
				28	*
				29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
				30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
				31	* work which got its smarts from Daniel J. Bernstein's work on the same.
				32	*/
				33
				34	#include <openssl/e_os2.h>
				35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
				36	NON_EMPTY_TRANSLATION_UNIT
				37	#else
				38
				39	# include <string.h>
				40	# include <openssl/err.h>
				41	# include "ec_local.h"
				42
				43	# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
				44	/* even with gcc, the typedef won't work for 32-bit platforms */
				45	typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
				46	* platforms */
				47	# else
				48	# error "Your compiler doesn't appear to support 128-bit integer types"
				49	# endif
				50
				51	typedef uint8_t u8;
				52	typedef uint64_t u64;
				53
				54	/*
				55	* The underlying field. P521 operates over GF(2^521-1). We can serialise an
				56	* element of this field into 66 bytes where the most significant byte
				57	* contains only a single bit. We call this an felem_bytearray.
				58	*/
				59
				60	typedef u8 felem_bytearray[66];
				61
				62	/*
				63	* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
				64	* These values are big-endian.
				65	*/
				66	static const felem_bytearray nistp521_curve_params[5] = {
				67	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
				68	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				69	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				70	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				71	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				72	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				73	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				74	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				75	0xff, 0xff},
				76	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
				77	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				78	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				79	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				80	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				81	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				82	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				83	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				84	0xff, 0xfc},
				85	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
				86	0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
				87	0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
				88	0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
				89	0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
				90	0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
				91	0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
				92	0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
				93	0x3f, 0x00},
				94	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
				95	0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
				96	0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
				97	0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
				98	0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
				99	0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
				100	0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
				101	0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
				102	0xbd, 0x66},
				103	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
				104	0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
				105	0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
				106	0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
				107	0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
				108	0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
				109	0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
				110	0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
				111	0x66, 0x50}
				112	};
				113
				114	/*-
				115	* The representation of field elements.
				116	* ------------------------------------
				117	*
				118	* We represent field elements with nine values. These values are either 64 or
				119	* 128 bits and the field element represented is:
				120	* v[0]2^0 + v[1]2^58 + v[2]2^116 + ... + v[8]2^464 (mod p)
				121	* Each of the nine values is called a 'limb'. Since the limbs are spaced only
				122	* 58 bits apart, but are greater than 58 bits in length, the most significant
				123	* bits of each limb overlap with the least significant bits of the next.
				124	*
				125	* A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
				126	* 'largefelem' */
				127
				128	# define NLIMBS 9
				129
				130	typedef uint64_t limb;
				131	typedef limb limb_aX __attribute((__aligned__(1)));
				132	typedef limb felem[NLIMBS];
				133	typedef uint128_t largefelem[NLIMBS];
				134
				135	static const limb bottom57bits = 0x1ffffffffffffff;
				136	static const limb bottom58bits = 0x3ffffffffffffff;
				137
				138	/*
				139	* bin66_to_felem takes a little-endian byte array and converts it into felem
				140	* form. This assumes that the CPU is little-endian.
				141	*/
				142	static void bin66_to_felem(felem out, const u8 in[66])
				143	{
				144	out[0] = (((limb ) & in[0])) & bottom58bits;
				145	out[1] = (((limb_aX ) & in[7]) >> 2) & bottom58bits;
				146	out[2] = (((limb_aX ) & in[14]) >> 4) & bottom58bits;
				147	out[3] = (((limb_aX ) & in[21]) >> 6) & bottom58bits;
				148	out[4] = (((limb_aX ) & in[29])) & bottom58bits;
				149	out[5] = (((limb_aX ) & in[36]) >> 2) & bottom58bits;
				150	out[6] = (((limb_aX ) & in[43]) >> 4) & bottom58bits;
				151	out[7] = (((limb_aX ) & in[50]) >> 6) & bottom58bits;
				152	out[8] = (((limb_aX ) & in[58])) & bottom57bits;
				153	}
				154
				155	/*
				156	* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
				157	* array. This assumes that the CPU is little-endian.
				158	*/
				159	static void felem_to_bin66(u8 out[66], const felem in)
				160	{
				161	memset(out, 0, 66);
				162	(((limb ) & out[0])) = in[0];
				163	(((limb_aX ) & out[7])) \|= in[1] << 2;
				164	(((limb_aX ) & out[14])) \|= in[2] << 4;
				165	(((limb_aX ) & out[21])) \|= in[3] << 6;
				166	(((limb_aX ) & out[29])) = in[4];
				167	(((limb_aX ) & out[36])) \|= in[5] << 2;
				168	(((limb_aX ) & out[43])) \|= in[6] << 4;
				169	(((limb_aX ) & out[50])) \|= in[7] << 6;
				170	(((limb_aX ) & out[58])) = in[8];
				171	}
				172
				173	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
				174	static int BN_to_felem(felem out, const BIGNUM *bn)
				175	{
				176	felem_bytearray b_out;
				177	int num_bytes;
				178
				179	if (BN_is_negative(bn)) {
				180	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
				181	return 0;
				182	}
				183	num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
				184	if (num_bytes < 0) {
				185	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
				186	return 0;
				187	}
				188	bin66_to_felem(out, b_out);
				189	return 1;
				190	}
				191
				192	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
				193	static BIGNUM felem_to_BN(BIGNUM out, const felem in)
				194	{
				195	felem_bytearray b_out;
				196	felem_to_bin66(b_out, in);
				197	return BN_lebin2bn(b_out, sizeof(b_out), out);
				198	}
				199
				200	/*-
				201	* Field operations
				202	* ----------------
				203	*/
				204
				205	static void felem_one(felem out)
				206	{
				207	out[0] = 1;
				208	out[1] = 0;
				209	out[2] = 0;
				210	out[3] = 0;
				211	out[4] = 0;
				212	out[5] = 0;
				213	out[6] = 0;
				214	out[7] = 0;
				215	out[8] = 0;
				216	}
				217
				218	static void felem_assign(felem out, const felem in)
				219	{
				220	out[0] = in[0];
				221	out[1] = in[1];
				222	out[2] = in[2];
				223	out[3] = in[3];
				224	out[4] = in[4];
				225	out[5] = in[5];
				226	out[6] = in[6];
				227	out[7] = in[7];
				228	out[8] = in[8];
				229	}
				230
				231	/* felem_sum64 sets out = out + in. */
				232	static void felem_sum64(felem out, const felem in)
				233	{
				234	out[0] += in[0];
				235	out[1] += in[1];
				236	out[2] += in[2];
				237	out[3] += in[3];
				238	out[4] += in[4];
				239	out[5] += in[5];
				240	out[6] += in[6];
				241	out[7] += in[7];
				242	out[8] += in[8];
				243	}
				244
				245	/* felem_scalar sets out = in * scalar */
				246	static void felem_scalar(felem out, const felem in, limb scalar)
				247	{
				248	out[0] = in[0] * scalar;
				249	out[1] = in[1] * scalar;
				250	out[2] = in[2] * scalar;
				251	out[3] = in[3] * scalar;
				252	out[4] = in[4] * scalar;
				253	out[5] = in[5] * scalar;
				254	out[6] = in[6] * scalar;
				255	out[7] = in[7] * scalar;
				256	out[8] = in[8] * scalar;
				257	}
				258
				259	/* felem_scalar64 sets out = out * scalar */
				260	static void felem_scalar64(felem out, limb scalar)
				261	{
				262	out[0] *= scalar;
				263	out[1] *= scalar;
				264	out[2] *= scalar;
				265	out[3] *= scalar;
				266	out[4] *= scalar;
				267	out[5] *= scalar;
				268	out[6] *= scalar;
				269	out[7] *= scalar;
				270	out[8] *= scalar;
				271	}
				272
				273	/* felem_scalar128 sets out = out * scalar */
				274	static void felem_scalar128(largefelem out, limb scalar)
				275	{
				276	out[0] *= scalar;
				277	out[1] *= scalar;
				278	out[2] *= scalar;
				279	out[3] *= scalar;
				280	out[4] *= scalar;
				281	out[5] *= scalar;
				282	out[6] *= scalar;
				283	out[7] *= scalar;
				284	out[8] *= scalar;
				285	}
				286
				287	/*-
				288	* felem_neg sets \|out\| to \|-in\|
				289	* On entry:
				290	* in[i] < 2^59 + 2^14
				291	* On exit:
				292	* out[i] < 2^62
				293	*/
				294	static void felem_neg(felem out, const felem in)
				295	{
				296	/* In order to prevent underflow, we subtract from 0 mod p. */
				297	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
				298	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
				299
				300	out[0] = two62m3 - in[0];
				301	out[1] = two62m2 - in[1];
				302	out[2] = two62m2 - in[2];
				303	out[3] = two62m2 - in[3];
				304	out[4] = two62m2 - in[4];
				305	out[5] = two62m2 - in[5];
				306	out[6] = two62m2 - in[6];
				307	out[7] = two62m2 - in[7];
				308	out[8] = two62m2 - in[8];
				309	}
				310
				311	/*-
				312	* felem_diff64 subtracts \|in\| from \|out\|
				313	* On entry:
				314	* in[i] < 2^59 + 2^14
				315	* On exit:
				316	* out[i] < out[i] + 2^62
				317	*/
				318	static void felem_diff64(felem out, const felem in)
				319	{
				320	/*
				321	* In order to prevent underflow, we add 0 mod p before subtracting.
				322	*/
				323	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
				324	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
				325
				326	out[0] += two62m3 - in[0];
				327	out[1] += two62m2 - in[1];
				328	out[2] += two62m2 - in[2];
				329	out[3] += two62m2 - in[3];
				330	out[4] += two62m2 - in[4];
				331	out[5] += two62m2 - in[5];
				332	out[6] += two62m2 - in[6];
				333	out[7] += two62m2 - in[7];
				334	out[8] += two62m2 - in[8];
				335	}
				336
				337	/*-
				338	* felem_diff_128_64 subtracts \|in\| from \|out\|
				339	* On entry:
				340	* in[i] < 2^62 + 2^17
				341	* On exit:
				342	* out[i] < out[i] + 2^63
				343	*/
				344	static void felem_diff_128_64(largefelem out, const felem in)
				345	{
				346	/*
				347	* In order to prevent underflow, we add 64p mod p (which is equivalent
				348	* to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
				349	* digit number with all bits set to 1. See "The representation of field
				350	* elements" comment above for a description of how limbs are used to
				351	* represent a number. 64p is represented with 8 limbs containing a number
				352	* with 58 bits set and one limb with a number with 57 bits set.
				353	*/
				354	static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
				355	static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
				356
				357	out[0] += two63m6 - in[0];
				358	out[1] += two63m5 - in[1];
				359	out[2] += two63m5 - in[2];
				360	out[3] += two63m5 - in[3];
				361	out[4] += two63m5 - in[4];
				362	out[5] += two63m5 - in[5];
				363	out[6] += two63m5 - in[6];
				364	out[7] += two63m5 - in[7];
				365	out[8] += two63m5 - in[8];
				366	}
				367
				368	/*-
				369	* felem_diff_128_64 subtracts \|in\| from \|out\|
				370	* On entry:
				371	* in[i] < 2^126
				372	* On exit:
				373	* out[i] < out[i] + 2^127 - 2^69
				374	*/
				375	static void felem_diff128(largefelem out, const largefelem in)
				376	{
				377	/*
				378	* In order to prevent underflow, we add 0 mod p before subtracting.
				379	*/
				380	static const uint128_t two127m70 =
				381	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
				382	static const uint128_t two127m69 =
				383	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
				384
				385	out[0] += (two127m70 - in[0]);
				386	out[1] += (two127m69 - in[1]);
				387	out[2] += (two127m69 - in[2]);
				388	out[3] += (two127m69 - in[3]);
				389	out[4] += (two127m69 - in[4]);
				390	out[5] += (two127m69 - in[5]);
				391	out[6] += (two127m69 - in[6]);
				392	out[7] += (two127m69 - in[7]);
				393	out[8] += (two127m69 - in[8]);
				394	}
				395
				396	/*-
				397	* felem_square sets \|out\| = \|in\|^2
				398	* On entry:
				399	* in[i] < 2^62
				400	* On exit:
				401	* out[i] < 17 * max(in[i]) * max(in[i])
				402	*/
				403	static void felem_square(largefelem out, const felem in)
				404	{
				405	felem inx2, inx4;
				406	felem_scalar(inx2, in, 2);
				407	felem_scalar(inx4, in, 4);
				408
				409	/*-
				410	* We have many cases were we want to do
				411	* in[x] * in[y] +
				412	* in[y] * in[x]
				413	* This is obviously just
				414	* 2 * in[x] * in[y]
				415	* However, rather than do the doubling on the 128 bit result, we
				416	* double one of the inputs to the multiplication by reading from
				417	* \|inx2\|
				418	*/
				419
				420	out[0] = ((uint128_t) in[0]) * in[0];
				421	out[1] = ((uint128_t) in[0]) * inx2[1];
				422	out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
				423	out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
				424	out[4] = ((uint128_t) in[0]) * inx2[4] +
				425	((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
				426	out[5] = ((uint128_t) in[0]) * inx2[5] +
				427	((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
				428	out[6] = ((uint128_t) in[0]) * inx2[6] +
				429	((uint128_t) in[1]) * inx2[5] +
				430	((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
				431	out[7] = ((uint128_t) in[0]) * inx2[7] +
				432	((uint128_t) in[1]) * inx2[6] +
				433	((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
				434	out[8] = ((uint128_t) in[0]) * inx2[8] +
				435	((uint128_t) in[1]) * inx2[7] +
				436	((uint128_t) in[2]) * inx2[6] +
				437	((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
				438
				439	/*
				440	* The remaining limbs fall above 2^521, with the first falling at 2^522.
				441	* They correspond to locations one bit up from the limbs produced above
				442	* so we would have to multiply by two to align them. Again, rather than
				443	* operate on the 128-bit result, we double one of the inputs to the
				444	* multiplication. If we want to double for both this reason, and the
				445	* reason above, then we end up multiplying by four.
				446	*/
				447
				448	/* 9 */
				449	out[0] += ((uint128_t) in[1]) * inx4[8] +
				450	((uint128_t) in[2]) * inx4[7] +
				451	((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
				452
				453	/* 10 */
				454	out[1] += ((uint128_t) in[2]) * inx4[8] +
				455	((uint128_t) in[3]) * inx4[7] +
				456	((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
				457
				458	/* 11 */
				459	out[2] += ((uint128_t) in[3]) * inx4[8] +
				460	((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
				461
				462	/* 12 */
				463	out[3] += ((uint128_t) in[4]) * inx4[8] +
				464	((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
				465
				466	/* 13 */
				467	out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
				468
				469	/* 14 */
				470	out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
				471
				472	/* 15 */
				473	out[6] += ((uint128_t) in[7]) * inx4[8];
				474
				475	/* 16 */
				476	out[7] += ((uint128_t) in[8]) * inx2[8];
				477	}
				478
				479	/*-
				480	* felem_mul sets \|out\| = \|in1\| * \|in2\|
				481	* On entry:
				482	* in1[i] < 2^64
				483	* in2[i] < 2^63
				484	* On exit:
				485	* out[i] < 17 * max(in1[i]) * max(in2[i])
				486	*/
				487	static void felem_mul(largefelem out, const felem in1, const felem in2)
				488	{
				489	felem in2x2;
				490	felem_scalar(in2x2, in2, 2);
				491
				492	out[0] = ((uint128_t) in1[0]) * in2[0];
				493
				494	out[1] = ((uint128_t) in1[0]) * in2[1] +
				495	((uint128_t) in1[1]) * in2[0];
				496
				497	out[2] = ((uint128_t) in1[0]) * in2[2] +
				498	((uint128_t) in1[1]) * in2[1] +
				499	((uint128_t) in1[2]) * in2[0];
				500
				501	out[3] = ((uint128_t) in1[0]) * in2[3] +
				502	((uint128_t) in1[1]) * in2[2] +
				503	((uint128_t) in1[2]) * in2[1] +
				504	((uint128_t) in1[3]) * in2[0];
				505
				506	out[4] = ((uint128_t) in1[0]) * in2[4] +
				507	((uint128_t) in1[1]) * in2[3] +
				508	((uint128_t) in1[2]) * in2[2] +
				509	((uint128_t) in1[3]) * in2[1] +
				510	((uint128_t) in1[4]) * in2[0];
				511
				512	out[5] = ((uint128_t) in1[0]) * in2[5] +
				513	((uint128_t) in1[1]) * in2[4] +
				514	((uint128_t) in1[2]) * in2[3] +
				515	((uint128_t) in1[3]) * in2[2] +
				516	((uint128_t) in1[4]) * in2[1] +
				517	((uint128_t) in1[5]) * in2[0];
				518
				519	out[6] = ((uint128_t) in1[0]) * in2[6] +
				520	((uint128_t) in1[1]) * in2[5] +
				521	((uint128_t) in1[2]) * in2[4] +
				522	((uint128_t) in1[3]) * in2[3] +
				523	((uint128_t) in1[4]) * in2[2] +
				524	((uint128_t) in1[5]) * in2[1] +
				525	((uint128_t) in1[6]) * in2[0];
				526
				527	out[7] = ((uint128_t) in1[0]) * in2[7] +
				528	((uint128_t) in1[1]) * in2[6] +
				529	((uint128_t) in1[2]) * in2[5] +
				530	((uint128_t) in1[3]) * in2[4] +
				531	((uint128_t) in1[4]) * in2[3] +
				532	((uint128_t) in1[5]) * in2[2] +
				533	((uint128_t) in1[6]) * in2[1] +
				534	((uint128_t) in1[7]) * in2[0];
				535
				536	out[8] = ((uint128_t) in1[0]) * in2[8] +
				537	((uint128_t) in1[1]) * in2[7] +
				538	((uint128_t) in1[2]) * in2[6] +
				539	((uint128_t) in1[3]) * in2[5] +
				540	((uint128_t) in1[4]) * in2[4] +
				541	((uint128_t) in1[5]) * in2[3] +
				542	((uint128_t) in1[6]) * in2[2] +
				543	((uint128_t) in1[7]) * in2[1] +
				544	((uint128_t) in1[8]) * in2[0];
				545
				546	/* See comment in felem_square about the use of in2x2 here */
				547
				548	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
				549	((uint128_t) in1[2]) * in2x2[7] +
				550	((uint128_t) in1[3]) * in2x2[6] +
				551	((uint128_t) in1[4]) * in2x2[5] +
				552	((uint128_t) in1[5]) * in2x2[4] +
				553	((uint128_t) in1[6]) * in2x2[3] +
				554	((uint128_t) in1[7]) * in2x2[2] +
				555	((uint128_t) in1[8]) * in2x2[1];
				556
				557	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
				558	((uint128_t) in1[3]) * in2x2[7] +
				559	((uint128_t) in1[4]) * in2x2[6] +
				560	((uint128_t) in1[5]) * in2x2[5] +
				561	((uint128_t) in1[6]) * in2x2[4] +
				562	((uint128_t) in1[7]) * in2x2[3] +
				563	((uint128_t) in1[8]) * in2x2[2];
				564
				565	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
				566	((uint128_t) in1[4]) * in2x2[7] +
				567	((uint128_t) in1[5]) * in2x2[6] +
				568	((uint128_t) in1[6]) * in2x2[5] +
				569	((uint128_t) in1[7]) * in2x2[4] +
				570	((uint128_t) in1[8]) * in2x2[3];
				571
				572	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
				573	((uint128_t) in1[5]) * in2x2[7] +
				574	((uint128_t) in1[6]) * in2x2[6] +
				575	((uint128_t) in1[7]) * in2x2[5] +
				576	((uint128_t) in1[8]) * in2x2[4];
				577
				578	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
				579	((uint128_t) in1[6]) * in2x2[7] +
				580	((uint128_t) in1[7]) * in2x2[6] +
				581	((uint128_t) in1[8]) * in2x2[5];
				582
				583	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
				584	((uint128_t) in1[7]) * in2x2[7] +
				585	((uint128_t) in1[8]) * in2x2[6];
				586
				587	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
				588	((uint128_t) in1[8]) * in2x2[7];
				589
				590	out[7] += ((uint128_t) in1[8]) * in2x2[8];
				591	}
				592
				593	static const limb bottom52bits = 0xfffffffffffff;
				594
				595	/*-
				596	* felem_reduce converts a largefelem to an felem.
				597	* On entry:
				598	* in[i] < 2^128
				599	* On exit:
				600	* out[i] < 2^59 + 2^14
				601	*/
				602	static void felem_reduce(felem out, const largefelem in)
				603	{
				604	u64 overflow1, overflow2;
				605
				606	out[0] = ((limb) in[0]) & bottom58bits;
				607	out[1] = ((limb) in[1]) & bottom58bits;
				608	out[2] = ((limb) in[2]) & bottom58bits;
				609	out[3] = ((limb) in[3]) & bottom58bits;
				610	out[4] = ((limb) in[4]) & bottom58bits;
				611	out[5] = ((limb) in[5]) & bottom58bits;
				612	out[6] = ((limb) in[6]) & bottom58bits;
				613	out[7] = ((limb) in[7]) & bottom58bits;
				614	out[8] = ((limb) in[8]) & bottom58bits;
				615
				616	/* out[i] < 2^58 */
				617
				618	out[1] += ((limb) in[0]) >> 58;
				619	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
				620	/*-
				621	* out[1] < 2^58 + 2^6 + 2^58
				622	* = 2^59 + 2^6
				623	*/
				624	out[2] += ((limb) (in[0] >> 64)) >> 52;
				625
				626	out[2] += ((limb) in[1]) >> 58;
				627	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
				628	out[3] += ((limb) (in[1] >> 64)) >> 52;
				629
				630	out[3] += ((limb) in[2]) >> 58;
				631	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
				632	out[4] += ((limb) (in[2] >> 64)) >> 52;
				633
				634	out[4] += ((limb) in[3]) >> 58;
				635	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
				636	out[5] += ((limb) (in[3] >> 64)) >> 52;
				637
				638	out[5] += ((limb) in[4]) >> 58;
				639	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
				640	out[6] += ((limb) (in[4] >> 64)) >> 52;
				641
				642	out[6] += ((limb) in[5]) >> 58;
				643	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
				644	out[7] += ((limb) (in[5] >> 64)) >> 52;
				645
				646	out[7] += ((limb) in[6]) >> 58;
				647	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
				648	out[8] += ((limb) (in[6] >> 64)) >> 52;
				649
				650	out[8] += ((limb) in[7]) >> 58;
				651	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
				652	/*-
				653	* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
				654	* < 2^59 + 2^13
				655	*/
				656	overflow1 = ((limb) (in[7] >> 64)) >> 52;
				657
				658	overflow1 += ((limb) in[8]) >> 58;
				659	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
				660	overflow2 = ((limb) (in[8] >> 64)) >> 52;
				661
				662	overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
				663	overflow2 <<= 1; /* overflow2 < 2^13 */
				664
				665	out[0] += overflow1; /* out[0] < 2^60 */
				666	out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
				667
				668	out[1] += out[0] >> 58;
				669	out[0] &= bottom58bits;
				670	/*-
				671	* out[0] < 2^58
				672	* out[1] < 2^59 + 2^6 + 2^13 + 2^2
				673	* < 2^59 + 2^14
				674	*/
				675	}
				676
				677	static void felem_square_reduce(felem out, const felem in)
				678	{
				679	largefelem tmp;
				680	felem_square(tmp, in);
				681	felem_reduce(out, tmp);
				682	}
				683
				684	static void felem_mul_reduce(felem out, const felem in1, const felem in2)
				685	{
				686	largefelem tmp;
				687	felem_mul(tmp, in1, in2);
				688	felem_reduce(out, tmp);
				689	}
				690
				691	/*-
				692	* felem_inv calculates \|out\| = \|in\|^{-1}
				693	*
				694	* Based on Fermat's Little Theorem:
				695	* a^p = a (mod p)
				696	* a^{p-1} = 1 (mod p)
				697	* a^{p-2} = a^{-1} (mod p)
				698	*/
				699	static void felem_inv(felem out, const felem in)
				700	{
				701	felem ftmp, ftmp2, ftmp3, ftmp4;
				702	largefelem tmp;
				703	unsigned i;
				704
				705	felem_square(tmp, in);
				706	felem_reduce(ftmp, tmp); /* 2^1 */
				707	felem_mul(tmp, in, ftmp);
				708	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
				709	felem_assign(ftmp2, ftmp);
				710	felem_square(tmp, ftmp);
				711	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
				712	felem_mul(tmp, in, ftmp);
				713	felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
				714	felem_square(tmp, ftmp);
				715	felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
				716
				717	felem_square(tmp, ftmp2);
				718	felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
				719	felem_square(tmp, ftmp3);
				720	felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
				721	felem_mul(tmp, ftmp3, ftmp2);
				722	felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
				723
				724	felem_assign(ftmp2, ftmp3);
				725	felem_square(tmp, ftmp3);
				726	felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
				727	felem_square(tmp, ftmp3);
				728	felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
				729	felem_square(tmp, ftmp3);
				730	felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
				731	felem_square(tmp, ftmp3);
				732	felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
				733	felem_assign(ftmp4, ftmp3);
				734	felem_mul(tmp, ftmp3, ftmp);
				735	felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
				736	felem_square(tmp, ftmp4);
				737	felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
				738	felem_mul(tmp, ftmp3, ftmp2);
				739	felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
				740	felem_assign(ftmp2, ftmp3);
				741
				742	for (i = 0; i < 8; i++) {
				743	felem_square(tmp, ftmp3);
				744	felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
				745	}
				746	felem_mul(tmp, ftmp3, ftmp2);
				747	felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
				748	felem_assign(ftmp2, ftmp3);
				749
				750	for (i = 0; i < 16; i++) {
				751	felem_square(tmp, ftmp3);
				752	felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
				753	}
				754	felem_mul(tmp, ftmp3, ftmp2);
				755	felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
				756	felem_assign(ftmp2, ftmp3);
				757
				758	for (i = 0; i < 32; i++) {
				759	felem_square(tmp, ftmp3);
				760	felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
				761	}
				762	felem_mul(tmp, ftmp3, ftmp2);
				763	felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
				764	felem_assign(ftmp2, ftmp3);
				765
				766	for (i = 0; i < 64; i++) {
				767	felem_square(tmp, ftmp3);
				768	felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
				769	}
				770	felem_mul(tmp, ftmp3, ftmp2);
				771	felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
				772	felem_assign(ftmp2, ftmp3);
				773
				774	for (i = 0; i < 128; i++) {
				775	felem_square(tmp, ftmp3);
				776	felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
				777	}
				778	felem_mul(tmp, ftmp3, ftmp2);
				779	felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
				780	felem_assign(ftmp2, ftmp3);
				781
				782	for (i = 0; i < 256; i++) {
				783	felem_square(tmp, ftmp3);
				784	felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
				785	}
				786	felem_mul(tmp, ftmp3, ftmp2);
				787	felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
				788
				789	for (i = 0; i < 9; i++) {
				790	felem_square(tmp, ftmp3);
				791	felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
				792	}
				793	felem_mul(tmp, ftmp3, ftmp4);
				794	felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
				795	felem_mul(tmp, ftmp3, in);
				796	felem_reduce(out, tmp); /* 2^512 - 3 */
				797	}
				798
				799	/* This is 2^521-1, expressed as an felem */
				800	static const felem kPrime = {
				801	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
				802	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
				803	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
				804	};
				805
				806	/*-
				807	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
				808	* otherwise.
				809	* On entry:
				810	* in[i] < 2^59 + 2^14
				811	*/
				812	static limb felem_is_zero(const felem in)
				813	{
				814	felem ftmp;
				815	limb is_zero, is_p;
				816	felem_assign(ftmp, in);
				817
				818	ftmp[0] += ftmp[8] >> 57;
				819	ftmp[8] &= bottom57bits;
				820	/* ftmp[8] < 2^57 */
				821	ftmp[1] += ftmp[0] >> 58;
				822	ftmp[0] &= bottom58bits;
				823	ftmp[2] += ftmp[1] >> 58;
				824	ftmp[1] &= bottom58bits;
				825	ftmp[3] += ftmp[2] >> 58;
				826	ftmp[2] &= bottom58bits;
				827	ftmp[4] += ftmp[3] >> 58;
				828	ftmp[3] &= bottom58bits;
				829	ftmp[5] += ftmp[4] >> 58;
				830	ftmp[4] &= bottom58bits;
				831	ftmp[6] += ftmp[5] >> 58;
				832	ftmp[5] &= bottom58bits;
				833	ftmp[7] += ftmp[6] >> 58;
				834	ftmp[6] &= bottom58bits;
				835	ftmp[8] += ftmp[7] >> 58;
				836	ftmp[7] &= bottom58bits;
				837	/* ftmp[8] < 2^57 + 4 */
				838
				839	/*
				840	* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
				841	* than our bound for ftmp[8]. Therefore we only have to check if the
				842	* zero is zero or 2^521-1.
				843	*/
				844
				845	is_zero = 0;
				846	is_zero \|= ftmp[0];
				847	is_zero \|= ftmp[1];
				848	is_zero \|= ftmp[2];
				849	is_zero \|= ftmp[3];
				850	is_zero \|= ftmp[4];
				851	is_zero \|= ftmp[5];
				852	is_zero \|= ftmp[6];
				853	is_zero \|= ftmp[7];
				854	is_zero \|= ftmp[8];
				855
				856	is_zero--;
				857	/*
				858	* We know that ftmp[i] < 2^63, therefore the only way that the top bit
				859	* can be set is if is_zero was 0 before the decrement.
				860	*/
				861	is_zero = 0 - (is_zero >> 63);
				862
				863	is_p = ftmp[0] ^ kPrime[0];
				864	is_p \|= ftmp[1] ^ kPrime[1];
				865	is_p \|= ftmp[2] ^ kPrime[2];
				866	is_p \|= ftmp[3] ^ kPrime[3];
				867	is_p \|= ftmp[4] ^ kPrime[4];
				868	is_p \|= ftmp[5] ^ kPrime[5];
				869	is_p \|= ftmp[6] ^ kPrime[6];
				870	is_p \|= ftmp[7] ^ kPrime[7];
				871	is_p \|= ftmp[8] ^ kPrime[8];
				872
				873	is_p--;
				874	is_p = 0 - (is_p >> 63);
				875
				876	is_zero \|= is_p;
				877	return is_zero;
				878	}
				879
				880	static int felem_is_zero_int(const void *in)
				881	{
				882	return (int)(felem_is_zero(in) & ((limb) 1));
				883	}
				884
				885	/*-
				886	* felem_contract converts \|in\| to its unique, minimal representation.
				887	* On entry:
				888	* in[i] < 2^59 + 2^14
				889	*/
				890	static void felem_contract(felem out, const felem in)
				891	{
				892	limb is_p, is_greater, sign;
				893	static const limb two58 = ((limb) 1) << 58;
				894
				895	felem_assign(out, in);
				896
				897	out[0] += out[8] >> 57;
				898	out[8] &= bottom57bits;
				899	/* out[8] < 2^57 */
				900	out[1] += out[0] >> 58;
				901	out[0] &= bottom58bits;
				902	out[2] += out[1] >> 58;
				903	out[1] &= bottom58bits;
				904	out[3] += out[2] >> 58;
				905	out[2] &= bottom58bits;
				906	out[4] += out[3] >> 58;
				907	out[3] &= bottom58bits;
				908	out[5] += out[4] >> 58;
				909	out[4] &= bottom58bits;
				910	out[6] += out[5] >> 58;
				911	out[5] &= bottom58bits;
				912	out[7] += out[6] >> 58;
				913	out[6] &= bottom58bits;
				914	out[8] += out[7] >> 58;
				915	out[7] &= bottom58bits;
				916	/* out[8] < 2^57 + 4 */
				917
				918	/*
				919	* If the value is greater than 2^521-1 then we have to subtract 2^521-1
				920	* out. See the comments in felem_is_zero regarding why we don't test for
				921	* other multiples of the prime.
				922	*/
				923
				924	/*
				925	* First, if \|out\| is equal to 2^521-1, we subtract it out to get zero.
				926	*/
				927
				928	is_p = out[0] ^ kPrime[0];
				929	is_p \|= out[1] ^ kPrime[1];
				930	is_p \|= out[2] ^ kPrime[2];
				931	is_p \|= out[3] ^ kPrime[3];
				932	is_p \|= out[4] ^ kPrime[4];
				933	is_p \|= out[5] ^ kPrime[5];
				934	is_p \|= out[6] ^ kPrime[6];
				935	is_p \|= out[7] ^ kPrime[7];
				936	is_p \|= out[8] ^ kPrime[8];
				937
				938	is_p--;
				939	is_p &= is_p << 32;
				940	is_p &= is_p << 16;
				941	is_p &= is_p << 8;
				942	is_p &= is_p << 4;
				943	is_p &= is_p << 2;
				944	is_p &= is_p << 1;
				945	is_p = 0 - (is_p >> 63);
				946	is_p = ~is_p;
				947
				948	/* is_p is 0 iff \|out\| == 2^521-1 and all ones otherwise */
				949
				950	out[0] &= is_p;
				951	out[1] &= is_p;
				952	out[2] &= is_p;
				953	out[3] &= is_p;
				954	out[4] &= is_p;
				955	out[5] &= is_p;
				956	out[6] &= is_p;
				957	out[7] &= is_p;
				958	out[8] &= is_p;
				959
				960	/*
				961	* In order to test that \|out\| >= 2^521-1 we need only test if out[8] >>
				962	* 57 is greater than zero as (2^521-1) + x >= 2^522
				963	*/
				964	is_greater = out[8] >> 57;
				965	is_greater \|= is_greater << 32;
				966	is_greater \|= is_greater << 16;
				967	is_greater \|= is_greater << 8;
				968	is_greater \|= is_greater << 4;
				969	is_greater \|= is_greater << 2;
				970	is_greater \|= is_greater << 1;
				971	is_greater = 0 - (is_greater >> 63);
				972
				973	out[0] -= kPrime[0] & is_greater;
				974	out[1] -= kPrime[1] & is_greater;
				975	out[2] -= kPrime[2] & is_greater;
				976	out[3] -= kPrime[3] & is_greater;
				977	out[4] -= kPrime[4] & is_greater;
				978	out[5] -= kPrime[5] & is_greater;
				979	out[6] -= kPrime[6] & is_greater;
				980	out[7] -= kPrime[7] & is_greater;
				981	out[8] -= kPrime[8] & is_greater;
				982
				983	/* Eliminate negative coefficients */
				984	sign = -(out[0] >> 63);
				985	out[0] += (two58 & sign);
				986	out[1] -= (1 & sign);
				987	sign = -(out[1] >> 63);
				988	out[1] += (two58 & sign);
				989	out[2] -= (1 & sign);
				990	sign = -(out[2] >> 63);
				991	out[2] += (two58 & sign);
				992	out[3] -= (1 & sign);
				993	sign = -(out[3] >> 63);
				994	out[3] += (two58 & sign);
				995	out[4] -= (1 & sign);
				996	sign = -(out[4] >> 63);
				997	out[4] += (two58 & sign);
				998	out[5] -= (1 & sign);
				999	sign = -(out[0] >> 63);
				1000	out[5] += (two58 & sign);
				1001	out[6] -= (1 & sign);
				1002	sign = -(out[6] >> 63);
				1003	out[6] += (two58 & sign);
				1004	out[7] -= (1 & sign);
				1005	sign = -(out[7] >> 63);
				1006	out[7] += (two58 & sign);
				1007	out[8] -= (1 & sign);
				1008	sign = -(out[5] >> 63);
				1009	out[5] += (two58 & sign);
				1010	out[6] -= (1 & sign);
				1011	sign = -(out[6] >> 63);
				1012	out[6] += (two58 & sign);
				1013	out[7] -= (1 & sign);
				1014	sign = -(out[7] >> 63);
				1015	out[7] += (two58 & sign);
				1016	out[8] -= (1 & sign);
				1017	}
				1018
				1019	/*-
				1020	* Group operations
				1021	* ----------------
				1022	*
				1023	* Building on top of the field operations we have the operations on the
				1024	* elliptic curve group itself. Points on the curve are represented in Jacobian
				1025	* coordinates */
				1026
				1027	/*-
				1028	* point_double calculates 2*(x_in, y_in, z_in)
				1029	*
				1030	* The method is taken from:
				1031	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
				1032	*
				1033	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
				1034	* while x_out == y_in is not (maybe this works, but it's not tested). */
				1035	static void
				1036	point_double(felem x_out, felem y_out, felem z_out,
				1037	const felem x_in, const felem y_in, const felem z_in)
				1038	{
				1039	largefelem tmp, tmp2;
				1040	felem delta, gamma, beta, alpha, ftmp, ftmp2;
				1041
				1042	felem_assign(ftmp, x_in);
				1043	felem_assign(ftmp2, x_in);
				1044
				1045	/* delta = z^2 */
				1046	felem_square(tmp, z_in);
				1047	felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
				1048
				1049	/* gamma = y^2 */
				1050	felem_square(tmp, y_in);
				1051	felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
				1052
				1053	/* beta = xgamma /
				1054	felem_mul(tmp, x_in, gamma);
				1055	felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
				1056
				1057	/* alpha = 3(x-delta)(x+delta) */
				1058	felem_diff64(ftmp, delta);
				1059	/* ftmp[i] < 2^61 */
				1060	felem_sum64(ftmp2, delta);
				1061	/* ftmp2[i] < 2^60 + 2^15 */
				1062	felem_scalar64(ftmp2, 3);
				1063	/* ftmp2[i] < 32^60 + 32^15 */
				1064	felem_mul(tmp, ftmp, ftmp2);
				1065	/*-
				1066	* tmp[i] < 17(32^121 + 32^76)
				1067	* = 612^121 + 612^76
				1068	* < 642^121 + 642^76
				1069	* = 2^127 + 2^82
				1070	* < 2^128
				1071	*/
				1072	felem_reduce(alpha, tmp);
				1073
				1074	/* x' = alpha^2 - 8beta /
				1075	felem_square(tmp, alpha);
				1076	/*
				1077	* tmp[i] < 17*2^120 < 2^125
				1078	*/
				1079	felem_assign(ftmp, beta);
				1080	felem_scalar64(ftmp, 8);
				1081	/* ftmp[i] < 2^62 + 2^17 */
				1082	felem_diff_128_64(tmp, ftmp);
				1083	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
				1084	felem_reduce(x_out, tmp);
				1085
				1086	/* z' = (y + z)^2 - gamma - delta */
				1087	felem_sum64(delta, gamma);
				1088	/* delta[i] < 2^60 + 2^15 */
				1089	felem_assign(ftmp, y_in);
				1090	felem_sum64(ftmp, z_in);
				1091	/* ftmp[i] < 2^60 + 2^15 */
				1092	felem_square(tmp, ftmp);
				1093	/*
				1094	* tmp[i] < 17(2^122) < 2^127
				1095	*/
				1096	felem_diff_128_64(tmp, delta);
				1097	/* tmp[i] < 2^127 + 2^63 */
				1098	felem_reduce(z_out, tmp);
				1099
				1100	/* y' = alpha(4beta - x') - 8gamma^2 /
				1101	felem_scalar64(beta, 4);
				1102	/* beta[i] < 2^61 + 2^16 */
				1103	felem_diff64(beta, x_out);
				1104	/* beta[i] < 2^61 + 2^60 + 2^16 */
				1105	felem_mul(tmp, alpha, beta);
				1106	/*-
				1107	* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
				1108	* = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
				1109	* = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
				1110	* < 2^128
				1111	*/
				1112	felem_square(tmp2, gamma);
				1113	/*-
				1114	* tmp2[i] < 17*(2^59 + 2^14)^2
				1115	* = 17*(2^118 + 2^74 + 2^28)
				1116	*/
				1117	felem_scalar128(tmp2, 8);
				1118	/*-
				1119	* tmp2[i] < 817(2^118 + 2^74 + 2^28)
				1120	* = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
				1121	* < 2^126
				1122	*/
				1123	felem_diff128(tmp, tmp2);
				1124	/*-
				1125	* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
				1126	* = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
				1127	* 2^74 + 2^69 + 2^34 + 2^30
				1128	* < 2^128
				1129	*/
				1130	felem_reduce(y_out, tmp);
				1131	}
				1132
				1133	/* copy_conditional copies in to out iff mask is all ones. */
				1134	static void copy_conditional(felem out, const felem in, limb mask)
				1135	{
				1136	unsigned i;
				1137	for (i = 0; i < NLIMBS; ++i) {
				1138	const limb tmp = mask & (in[i] ^ out[i]);
				1139	out[i] ^= tmp;
				1140	}
				1141	}
				1142
				1143	/*-
				1144	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
				1145	*
				1146	* The method is taken from
				1147	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
				1148	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
				1149	*
				1150	* This function includes a branch for checking whether the two input points
				1151	* are equal (while not equal to the point at infinity). See comment below
				1152	* on constant-time.
				1153	*/
				1154	static void point_add(felem x3, felem y3, felem z3,
				1155	const felem x1, const felem y1, const felem z1,
				1156	const int mixed, const felem x2, const felem y2,
				1157	const felem z2)
				1158	{
				1159	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
				1160	largefelem tmp, tmp2;
				1161	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
				1162	limb points_equal;
				1163
				1164	z1_is_zero = felem_is_zero(z1);
				1165	z2_is_zero = felem_is_zero(z2);
				1166
				1167	/* ftmp = z1z1 = z1*2 /
				1168	felem_square(tmp, z1);
				1169	felem_reduce(ftmp, tmp);
				1170
				1171	if (!mixed) {
				1172	/* ftmp2 = z2z2 = z2*2 /
				1173	felem_square(tmp, z2);
				1174	felem_reduce(ftmp2, tmp);
				1175
				1176	/* u1 = ftmp3 = x1z2z2 /
				1177	felem_mul(tmp, x1, ftmp2);
				1178	felem_reduce(ftmp3, tmp);
				1179
				1180	/* ftmp5 = z1 + z2 */
				1181	felem_assign(ftmp5, z1);
				1182	felem_sum64(ftmp5, z2);
				1183	/* ftmp5[i] < 2^61 */
				1184
				1185	/* ftmp5 = (z1 + z2)*2 - z1z1 - z2z2 = 2z1z2 */
				1186	felem_square(tmp, ftmp5);
				1187	/* tmp[i] < 172^122 /
				1188	felem_diff_128_64(tmp, ftmp);
				1189	/* tmp[i] < 172^122 + 2^63 /
				1190	felem_diff_128_64(tmp, ftmp2);
				1191	/* tmp[i] < 172^122 + 2^64 /
				1192	felem_reduce(ftmp5, tmp);
				1193
				1194	/* ftmp2 = z2 * z2z2 */
				1195	felem_mul(tmp, ftmp2, z2);
				1196	felem_reduce(ftmp2, tmp);
				1197
				1198	/* s1 = ftmp6 = y1 * z2*3 /
				1199	felem_mul(tmp, y1, ftmp2);
				1200	felem_reduce(ftmp6, tmp);
				1201	} else {
				1202	/*
				1203	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
				1204	*/
				1205
				1206	/* u1 = ftmp3 = x1z2z2 /
				1207	felem_assign(ftmp3, x1);
				1208
				1209	/* ftmp5 = 2z1z2 /
				1210	felem_scalar(ftmp5, z1, 2);
				1211
				1212	/* s1 = ftmp6 = y1 * z2*3 /
				1213	felem_assign(ftmp6, y1);
				1214	}
				1215
				1216	/* u2 = x2z1z1 /
				1217	felem_mul(tmp, x2, ftmp);
				1218	/* tmp[i] < 172^120 /
				1219
				1220	/* h = ftmp4 = u2 - u1 */
				1221	felem_diff_128_64(tmp, ftmp3);
				1222	/* tmp[i] < 172^120 + 2^63 /
				1223	felem_reduce(ftmp4, tmp);
				1224
				1225	x_equal = felem_is_zero(ftmp4);
				1226
				1227	/* z_out = ftmp5 * h */
				1228	felem_mul(tmp, ftmp5, ftmp4);
				1229	felem_reduce(z_out, tmp);
				1230
				1231	/* ftmp = z1 * z1z1 */
				1232	felem_mul(tmp, ftmp, z1);
				1233	felem_reduce(ftmp, tmp);
				1234
				1235	/* s2 = tmp = y2 * z1*3 /
				1236	felem_mul(tmp, y2, ftmp);
				1237	/* tmp[i] < 172^120 /
				1238
				1239	/* r = ftmp5 = (s2 - s1)2 /
				1240	felem_diff_128_64(tmp, ftmp6);
				1241	/* tmp[i] < 172^120 + 2^63 /
				1242	felem_reduce(ftmp5, tmp);
				1243	y_equal = felem_is_zero(ftmp5);
				1244	felem_scalar64(ftmp5, 2);
				1245	/* ftmp5[i] < 2^61 */
				1246
				1247	/*
				1248	* The formulae are incorrect if the points are equal, in affine coordinates
				1249	* (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
				1250	* happens.
				1251	*
				1252	* We use bitwise operations to avoid potential side-channels introduced by
				1253	* the short-circuiting behaviour of boolean operators.
				1254	*
				1255	* The special case of either point being the point at infinity (z1 and/or
				1256	* z2 are zero), is handled separately later on in this function, so we
				1257	* avoid jumping to point_double here in those special cases.
				1258	*
				1259	* Notice the comment below on the implications of this branching for timing
				1260	* leaks and why it is considered practically irrelevant.
				1261	*/
				1262	points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
				1263
				1264	if (points_equal) {
				1265	/*
				1266	* This is obviously not constant-time but it will almost-never happen
				1267	* for ECDH / ECDSA. The case where it can happen is during scalar-mult
				1268	* where the intermediate value gets very close to the group order.
				1269	* Since \|ec_GFp_nistp_recode_scalar_bits\| produces signed digits for
				1270	* the scalar, it's possible for the intermediate value to be a small
				1271	* negative multiple of the base point, and for the final signed digit
				1272	* to be the same value. We believe that this only occurs for the scalar
				1273	* 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
				1274	* ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
				1275	* 71e913863f7, in that case the penultimate intermediate is -9G and
				1276	* the final digit is also -9G. Since this only happens for a single
				1277	* scalar, the timing leak is irrelevant. (Any attacker who wanted to
				1278	* check whether a secret scalar was that exact value, can already do
				1279	* so.)
				1280	*/
				1281	point_double(x3, y3, z3, x1, y1, z1);
				1282	return;
				1283	}
				1284
				1285	/* I = ftmp = (2h)*2 /
				1286	felem_assign(ftmp, ftmp4);
				1287	felem_scalar64(ftmp, 2);
				1288	/* ftmp[i] < 2^61 */
				1289	felem_square(tmp, ftmp);
				1290	/* tmp[i] < 172^122 /
				1291	felem_reduce(ftmp, tmp);
				1292
				1293	/* J = ftmp2 = h * I */
				1294	felem_mul(tmp, ftmp4, ftmp);
				1295	felem_reduce(ftmp2, tmp);
				1296
				1297	/* V = ftmp4 = U1 * I */
				1298	felem_mul(tmp, ftmp3, ftmp);
				1299	felem_reduce(ftmp4, tmp);
				1300
				1301	/* x_out = r*2 - J - 2V /
				1302	felem_square(tmp, ftmp5);
				1303	/* tmp[i] < 172^122 /
				1304	felem_diff_128_64(tmp, ftmp2);
				1305	/* tmp[i] < 172^122 + 2^63 /
				1306	felem_assign(ftmp3, ftmp4);
				1307	felem_scalar64(ftmp4, 2);
				1308	/* ftmp4[i] < 2^61 */
				1309	felem_diff_128_64(tmp, ftmp4);
				1310	/* tmp[i] < 172^122 + 2^64 /
				1311	felem_reduce(x_out, tmp);
				1312
				1313	/* y_out = r(V-x_out) - 2 * s1 * J */
				1314	felem_diff64(ftmp3, x_out);
				1315	/*
				1316	* ftmp3[i] < 2^60 + 2^60 = 2^61
				1317	*/
				1318	felem_mul(tmp, ftmp5, ftmp3);
				1319	/* tmp[i] < 172^122 /
				1320	felem_mul(tmp2, ftmp6, ftmp2);
				1321	/* tmp2[i] < 172^120 /
				1322	felem_scalar128(tmp2, 2);
				1323	/* tmp2[i] < 172^121 /
				1324	felem_diff128(tmp, tmp2);
				1325	/*-
				1326	* tmp[i] < 2^127 - 2^69 + 17*2^122
				1327	* = 2^126 - 2^122 - 2^6 - 2^2 - 1
				1328	* < 2^127
				1329	*/
				1330	felem_reduce(y_out, tmp);
				1331
				1332	copy_conditional(x_out, x2, z1_is_zero);
				1333	copy_conditional(x_out, x1, z2_is_zero);
				1334	copy_conditional(y_out, y2, z1_is_zero);
				1335	copy_conditional(y_out, y1, z2_is_zero);
				1336	copy_conditional(z_out, z2, z1_is_zero);
				1337	copy_conditional(z_out, z1, z2_is_zero);
				1338	felem_assign(x3, x_out);
				1339	felem_assign(y3, y_out);
				1340	felem_assign(z3, z_out);
				1341	}
				1342
				1343	/*-
				1344	* Base point pre computation
				1345	* --------------------------
				1346	*
				1347	* Two different sorts of precomputed tables are used in the following code.
				1348	* Each contain various points on the curve, where each point is three field
				1349	* elements (x, y, z).
				1350	*
				1351	* For the base point table, z is usually 1 (0 for the point at infinity).
				1352	* This table has 16 elements:
				1353	* index \| bits \| point
				1354	* ------+---------+------------------------------
				1355	* 0 \| 0 0 0 0 \| 0G
				1356	* 1 \| 0 0 0 1 \| 1G
				1357	* 2 \| 0 0 1 0 \| 2^130G
				1358	* 3 \| 0 0 1 1 \| (2^130 + 1)G
				1359	* 4 \| 0 1 0 0 \| 2^260G
				1360	* 5 \| 0 1 0 1 \| (2^260 + 1)G
				1361	* 6 \| 0 1 1 0 \| (2^260 + 2^130)G
				1362	* 7 \| 0 1 1 1 \| (2^260 + 2^130 + 1)G
				1363	* 8 \| 1 0 0 0 \| 2^390G
				1364	* 9 \| 1 0 0 1 \| (2^390 + 1)G
				1365	* 10 \| 1 0 1 0 \| (2^390 + 2^130)G
				1366	* 11 \| 1 0 1 1 \| (2^390 + 2^130 + 1)G
				1367	* 12 \| 1 1 0 0 \| (2^390 + 2^260)G
				1368	* 13 \| 1 1 0 1 \| (2^390 + 2^260 + 1)G
				1369	* 14 \| 1 1 1 0 \| (2^390 + 2^260 + 2^130)G
				1370	* 15 \| 1 1 1 1 \| (2^390 + 2^260 + 2^130 + 1)G
				1371	*
				1372	* The reason for this is so that we can clock bits into four different
				1373	* locations when doing simple scalar multiplies against the base point.
				1374	*
				1375	* Tables for other points have table[i] = iG for i in 0 .. 16. */
				1376
				1377	/* gmul is the table of precomputed base points */
				1378	static const felem gmul[16][3] = {
				1379	{{0, 0, 0, 0, 0, 0, 0, 0, 0},
				1380	{0, 0, 0, 0, 0, 0, 0, 0, 0},
				1381	{0, 0, 0, 0, 0, 0, 0, 0, 0}},
				1382	{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
				1383	0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
				1384	0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
				1385	{0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
				1386	0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
				1387	0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
				1388	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1389	{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
				1390	0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
				1391	0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
				1392	{0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
				1393	0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
				1394	0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
				1395	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1396	{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
				1397	0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
				1398	0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
				1399	{0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
				1400	0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
				1401	0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
				1402	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1403	{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
				1404	0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
				1405	0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
				1406	{0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
				1407	0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
				1408	0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
				1409	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1410	{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
				1411	0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
				1412	0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
				1413	{0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
				1414	0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
				1415	0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
				1416	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1417	{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
				1418	0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
				1419	0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
				1420	{0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
				1421	0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
				1422	0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
				1423	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1424	{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
				1425	0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
				1426	0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
				1427	{0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
				1428	0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
				1429	0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
				1430	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1431	{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
				1432	0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
				1433	0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
				1434	{0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
				1435	0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
				1436	0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
				1437	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1438	{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
				1439	0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
				1440	0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
				1441	{0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
				1442	0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
				1443	0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
				1444	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1445	{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
				1446	0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
				1447	0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
				1448	{0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
				1449	0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
				1450	0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
				1451	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1452	{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
				1453	0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
				1454	0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
				1455	{0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
				1456	0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
				1457	0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
				1458	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1459	{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
				1460	0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
				1461	0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
				1462	{0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
				1463	0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
				1464	0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
				1465	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1466	{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
				1467	0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
				1468	0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
				1469	{0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
				1470	0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
				1471	0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
				1472	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1473	{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
				1474	0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
				1475	0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
				1476	{0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
				1477	0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
				1478	0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
				1479	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
				1480	{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
				1481	0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
				1482	0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
				1483	{0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
				1484	0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
				1485	0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
				1486	{1, 0, 0, 0, 0, 0, 0, 0, 0}}
				1487	};
				1488
				1489	/*
				1490	* select_point selects the \|idx\|th point from a precomputation table and
				1491	* copies it to out.
				1492	*/
				1493	/* pre_comp below is of the size provided in \|size\| */
				1494	static void select_point(const limb idx, unsigned int size,
				1495	const felem pre_comp[][3], felem out[3])
				1496	{
				1497	unsigned i, j;
				1498	limb *outlimbs = &out[0][0];
				1499
				1500	memset(out, 0, sizeof(out) 3);
				1501
				1502	for (i = 0; i < size; i++) {
				1503	const limb *inlimbs = &pre_comp[i][0][0];
				1504	limb mask = i ^ idx;
				1505	mask \|= mask >> 4;
				1506	mask \|= mask >> 2;
				1507	mask \|= mask >> 1;
				1508	mask &= 1;
				1509	mask--;
				1510	for (j = 0; j < NLIMBS * 3; j++)
				1511	outlimbs[j] \|= inlimbs[j] & mask;
				1512	}
				1513	}
				1514
				1515	/* get_bit returns the \|i\|th bit in \|in\| */
				1516	static char get_bit(const felem_bytearray in, int i)
				1517	{
				1518	if (i < 0)
				1519	return 0;
				1520	return (in[i >> 3] >> (i & 7)) & 1;
				1521	}
				1522
				1523	/*
				1524	* Interleaved point multiplication using precomputed point multiples: The
				1525	* small point multiples 0P, 1P, ..., 16*P are in pre_comp[], the scalars
				1526	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
				1527	* generator, using certain (large) precomputed multiples in g_pre_comp.
				1528	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
				1529	*/
				1530	static void batch_mul(felem x_out, felem y_out, felem z_out,
				1531	const felem_bytearray scalars[],
				1532	const unsigned num_points, const u8 *g_scalar,
				1533	const int mixed, const felem pre_comp[][17][3],
				1534	const felem g_pre_comp[16][3])
				1535	{
				1536	int i, skip;
				1537	unsigned num, gen_mul = (g_scalar != NULL);
				1538	felem nq[3], tmp[4];
				1539	limb bits;
				1540	u8 sign, digit;
				1541
				1542	/* set nq to the point at infinity */
				1543	memset(nq, 0, sizeof(nq));
				1544
				1545	/*
				1546	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
				1547	* of the generator (last quarter of rounds) and additions of other
				1548	* points multiples (every 5th round).
				1549	*/
				1550	skip = 1; /* save two point operations in the first
				1551	* round */
				1552	for (i = (num_points ? 520 : 130); i >= 0; --i) {
				1553	/* double */
				1554	if (!skip)
				1555	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
				1556
				1557	/* add multiples of the generator */
				1558	if (gen_mul && (i <= 130)) {
				1559	bits = get_bit(g_scalar, i + 390) << 3;
				1560	if (i < 130) {
				1561	bits \|= get_bit(g_scalar, i + 260) << 2;
				1562	bits \|= get_bit(g_scalar, i + 130) << 1;
				1563	bits \|= get_bit(g_scalar, i);
				1564	}
				1565	/* select the point to add, in constant time */
				1566	select_point(bits, 16, g_pre_comp, tmp);
				1567	if (!skip) {
				1568	/* The 1 argument below is for "mixed" */
				1569	point_add(nq[0], nq[1], nq[2],
				1570	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
				1571	} else {
				1572	memcpy(nq, tmp, 3 * sizeof(felem));
				1573	skip = 0;
				1574	}
				1575	}
				1576
				1577	/* do other additions every 5 doublings */
				1578	if (num_points && (i % 5 == 0)) {
				1579	/* loop over all scalars */
				1580	for (num = 0; num < num_points; ++num) {
				1581	bits = get_bit(scalars[num], i + 4) << 5;
				1582	bits \|= get_bit(scalars[num], i + 3) << 4;
				1583	bits \|= get_bit(scalars[num], i + 2) << 3;
				1584	bits \|= get_bit(scalars[num], i + 1) << 2;
				1585	bits \|= get_bit(scalars[num], i) << 1;
				1586	bits \|= get_bit(scalars[num], i - 1);
				1587	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
				1588
				1589	/*
				1590	* select the point to add or subtract, in constant time
				1591	*/
				1592	select_point(digit, 17, pre_comp[num], tmp);
				1593	felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
				1594	* point */
				1595	copy_conditional(tmp[1], tmp[3], (-(limb) sign));
				1596
				1597	if (!skip) {
				1598	point_add(nq[0], nq[1], nq[2],
				1599	nq[0], nq[1], nq[2],
				1600	mixed, tmp[0], tmp[1], tmp[2]);
				1601	} else {
				1602	memcpy(nq, tmp, 3 * sizeof(felem));
				1603	skip = 0;
				1604	}
				1605	}
				1606	}
				1607	}
				1608	felem_assign(x_out, nq[0]);
				1609	felem_assign(y_out, nq[1]);
				1610	felem_assign(z_out, nq[2]);
				1611	}
				1612
				1613	/* Precomputation for the group generator. */
				1614	struct nistp521_pre_comp_st {
				1615	felem g_pre_comp[16][3];
				1616	CRYPTO_REF_COUNT references;
				1617	CRYPTO_RWLOCK *lock;
				1618	};
				1619
				1620	const EC_METHOD *EC_GFp_nistp521_method(void)
				1621	{
				1622	static const EC_METHOD ret = {
				1623	EC_FLAGS_DEFAULT_OCT,
				1624	NID_X9_62_prime_field,
				1625	ec_GFp_nistp521_group_init,
				1626	ec_GFp_simple_group_finish,
				1627	ec_GFp_simple_group_clear_finish,
				1628	ec_GFp_nist_group_copy,
				1629	ec_GFp_nistp521_group_set_curve,
				1630	ec_GFp_simple_group_get_curve,
				1631	ec_GFp_simple_group_get_degree,
				1632	ec_group_simple_order_bits,
				1633	ec_GFp_simple_group_check_discriminant,
				1634	ec_GFp_simple_point_init,
				1635	ec_GFp_simple_point_finish,
				1636	ec_GFp_simple_point_clear_finish,
				1637	ec_GFp_simple_point_copy,
				1638	ec_GFp_simple_point_set_to_infinity,
				1639	ec_GFp_simple_set_Jprojective_coordinates_GFp,
				1640	ec_GFp_simple_get_Jprojective_coordinates_GFp,
				1641	ec_GFp_simple_point_set_affine_coordinates,
				1642	ec_GFp_nistp521_point_get_affine_coordinates,
				1643	0 /* point_set_compressed_coordinates */ ,
				1644	0 /* point2oct */ ,
				1645	0 /* oct2point */ ,
				1646	ec_GFp_simple_add,
				1647	ec_GFp_simple_dbl,
				1648	ec_GFp_simple_invert,
				1649	ec_GFp_simple_is_at_infinity,
				1650	ec_GFp_simple_is_on_curve,
				1651	ec_GFp_simple_cmp,
				1652	ec_GFp_simple_make_affine,
				1653	ec_GFp_simple_points_make_affine,
				1654	ec_GFp_nistp521_points_mul,
				1655	ec_GFp_nistp521_precompute_mult,
				1656	ec_GFp_nistp521_have_precompute_mult,
				1657	ec_GFp_nist_field_mul,
				1658	ec_GFp_nist_field_sqr,
				1659	0 /* field_div */ ,
				1660	ec_GFp_simple_field_inv,
				1661	0 /* field_encode */ ,
				1662	0 /* field_decode */ ,
				1663	0, /* field_set_to_one */
				1664	ec_key_simple_priv2oct,
				1665	ec_key_simple_oct2priv,
				1666	0, /* set private */
				1667	ec_key_simple_generate_key,
				1668	ec_key_simple_check_key,
				1669	ec_key_simple_generate_public_key,
				1670	0, /* keycopy */
				1671	0, /* keyfinish */
				1672	ecdh_simple_compute_key,
				1673	0, /* field_inverse_mod_ord */
				1674	0, /* blind_coordinates */
				1675	0, /* ladder_pre */
				1676	0, /* ladder_step */
				1677	0 /* ladder_post */
				1678	};
				1679
				1680	return &ret;
				1681	}
				1682
				1683	/******************************************************************************/
				1684	/*
				1685	* FUNCTIONS TO MANAGE PRECOMPUTATION
				1686	*/
				1687
				1688	static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
				1689	{
				1690	NISTP521_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
				1691
				1692	if (ret == NULL) {
				1693	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
				1694	return ret;
				1695	}
				1696
				1697	ret->references = 1;
				1698
				1699	ret->lock = CRYPTO_THREAD_lock_new();
				1700	if (ret->lock == NULL) {
				1701	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
				1702	OPENSSL_free(ret);
				1703	return NULL;
				1704	}
				1705	return ret;
				1706	}
				1707
				1708	NISTP521_PRE_COMP EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP p)
				1709	{
				1710	int i;
				1711	if (p != NULL)
				1712	CRYPTO_UP_REF(&p->references, &i, p->lock);
				1713	return p;
				1714	}
				1715
				1716	void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
				1717	{
				1718	int i;
				1719
				1720	if (p == NULL)
				1721	return;
				1722
				1723	CRYPTO_DOWN_REF(&p->references, &i, p->lock);
				1724	REF_PRINT_COUNT("EC_nistp521", x);
				1725	if (i > 0)
				1726	return;
				1727	REF_ASSERT_ISNT(i < 0);
				1728
				1729	CRYPTO_THREAD_lock_free(p->lock);
				1730	OPENSSL_free(p);
				1731	}
				1732
				1733	/******************************************************************************/
				1734	/*
				1735	* OPENSSL EC_METHOD FUNCTIONS
				1736	*/
				1737
				1738	int ec_GFp_nistp521_group_init(EC_GROUP *group)
				1739	{
				1740	int ret;
				1741	ret = ec_GFp_simple_group_init(group);
				1742	group->a_is_minus3 = 1;
				1743	return ret;
				1744	}
				1745
				1746	int ec_GFp_nistp521_group_set_curve(EC_GROUP group, const BIGNUM p,
				1747	const BIGNUM a, const BIGNUM b,
				1748	BN_CTX *ctx)
				1749	{
				1750	int ret = 0;
				1751	BN_CTX *new_ctx = NULL;
				1752	BIGNUM curve_p, curve_a, *curve_b;
				1753
				1754	if (ctx == NULL)
				1755	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
				1756	return 0;
				1757	BN_CTX_start(ctx);
				1758	curve_p = BN_CTX_get(ctx);
				1759	curve_a = BN_CTX_get(ctx);
				1760	curve_b = BN_CTX_get(ctx);
				1761	if (curve_b == NULL)
				1762	goto err;
				1763	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
				1764	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
				1765	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
				1766	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
				1767	ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
				1768	EC_R_WRONG_CURVE_PARAMETERS);
				1769	goto err;
				1770	}
				1771	group->field_mod_func = BN_nist_mod_521;
				1772	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
				1773	err:
				1774	BN_CTX_end(ctx);
				1775	BN_CTX_free(new_ctx);
				1776	return ret;
				1777	}
				1778
				1779	/*
				1780	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
				1781	* (X/Z^2, Y/Z^3)
				1782	*/
				1783	int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
				1784	const EC_POINT *point,
				1785	BIGNUM x, BIGNUM y,
				1786	BN_CTX *ctx)
				1787	{
				1788	felem z1, z2, x_in, y_in, x_out, y_out;
				1789	largefelem tmp;
				1790
				1791	if (EC_POINT_is_at_infinity(group, point)) {
				1792	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
				1793	EC_R_POINT_AT_INFINITY);
				1794	return 0;
				1795	}
				1796	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
				1797	(!BN_to_felem(z1, point->Z)))
				1798	return 0;
				1799	felem_inv(z2, z1);
				1800	felem_square(tmp, z2);
				1801	felem_reduce(z1, tmp);
				1802	felem_mul(tmp, x_in, z1);
				1803	felem_reduce(x_in, tmp);
				1804	felem_contract(x_out, x_in);
				1805	if (x != NULL) {
				1806	if (!felem_to_BN(x, x_out)) {
				1807	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
				1808	ERR_R_BN_LIB);
				1809	return 0;
				1810	}
				1811	}
				1812	felem_mul(tmp, z1, z2);
				1813	felem_reduce(z1, tmp);
				1814	felem_mul(tmp, y_in, z1);
				1815	felem_reduce(y_in, tmp);
				1816	felem_contract(y_out, y_in);
				1817	if (y != NULL) {
				1818	if (!felem_to_BN(y, y_out)) {
				1819	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
				1820	ERR_R_BN_LIB);
				1821	return 0;
				1822	}
				1823	}
				1824	return 1;
				1825	}
				1826
				1827	/* points below is of size \|num\|, and tmp_felems is of size \|num+1/ */
				1828	static void make_points_affine(size_t num, felem points[][3],
				1829	felem tmp_felems[])
				1830	{
				1831	/*
				1832	* Runs in constant time, unless an input is the point at infinity (which
				1833	* normally shouldn't happen).
				1834	*/
				1835	ec_GFp_nistp_points_make_affine_internal(num,
				1836	points,
				1837	sizeof(felem),
				1838	tmp_felems,
				1839	(void ()(void ))felem_one,
				1840	felem_is_zero_int,
				1841	(void ()(void , const void *))
				1842	felem_assign,
				1843	(void ()(void , const void *))
				1844	felem_square_reduce, (void (*)
				1845	(void *,
				1846	const void
				1847	*,
				1848	const void
				1849	*))
				1850	felem_mul_reduce,
				1851	(void ()(void , const void *))
				1852	felem_inv,
				1853	(void ()(void , const void *))
				1854	felem_contract);
				1855	}
				1856
				1857	/*
				1858	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
				1859	* values Result is stored in r (r can equal one of the inputs).
				1860	*/
				1861	int ec_GFp_nistp521_points_mul(const EC_GROUP group, EC_POINT r,
				1862	const BIGNUM *scalar, size_t num,
				1863	const EC_POINT *points[],
				1864	const BIGNUM scalars[], BN_CTX ctx)
				1865	{
				1866	int ret = 0;
				1867	int j;
				1868	int mixed = 0;
				1869	BIGNUM x, y, z, tmp_scalar;
				1870	felem_bytearray g_secret;
				1871	felem_bytearray *secrets = NULL;
				1872	felem (*pre_comp)[17][3] = NULL;
				1873	felem *tmp_felems = NULL;
				1874	unsigned i;
				1875	int num_bytes;
				1876	int have_pre_comp = 0;
				1877	size_t num_points = num;
				1878	felem x_in, y_in, z_in, x_out, y_out, z_out;
				1879	NISTP521_PRE_COMP *pre = NULL;
				1880	felem(*g_pre_comp)[3] = NULL;
				1881	EC_POINT *generator = NULL;
				1882	const EC_POINT *p = NULL;
				1883	const BIGNUM *p_scalar = NULL;
				1884
				1885	BN_CTX_start(ctx);
				1886	x = BN_CTX_get(ctx);
				1887	y = BN_CTX_get(ctx);
				1888	z = BN_CTX_get(ctx);
				1889	tmp_scalar = BN_CTX_get(ctx);
				1890	if (tmp_scalar == NULL)
				1891	goto err;
				1892
				1893	if (scalar != NULL) {
				1894	pre = group->pre_comp.nistp521;
				1895	if (pre)
				1896	/* we have precomputation, try to use it */
				1897	g_pre_comp = &pre->g_pre_comp[0];
				1898	else
				1899	/* try to use the standard precomputation */
				1900	g_pre_comp = (felem(*)[3]) gmul;
				1901	generator = EC_POINT_new(group);
				1902	if (generator == NULL)
				1903	goto err;
				1904	/* get the generator from precomputation */
				1905	if (!felem_to_BN(x, g_pre_comp[1][0]) \|\|
				1906	!felem_to_BN(y, g_pre_comp[1][1]) \|\|
				1907	!felem_to_BN(z, g_pre_comp[1][2])) {
				1908	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				1909	goto err;
				1910	}
				1911	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
				1912	generator, x, y, z,
				1913	ctx))
				1914	goto err;
				1915	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
				1916	/* precomputation matches generator */
				1917	have_pre_comp = 1;
				1918	else
				1919	/*
				1920	* we don't have valid precomputation: treat the generator as a
				1921	* random point
				1922	*/
				1923	num_points++;
				1924	}
				1925
				1926	if (num_points > 0) {
				1927	if (num_points >= 2) {
				1928	/*
				1929	* unless we precompute multiples for just one point, converting
				1930	* those into affine form is time well spent
				1931	*/
				1932	mixed = 1;
				1933	}
				1934	secrets = OPENSSL_zalloc(sizeof(secrets) num_points);
				1935	pre_comp = OPENSSL_zalloc(sizeof(pre_comp) num_points);
				1936	if (mixed)
				1937	tmp_felems =
				1938	OPENSSL_malloc(sizeof(tmp_felems) (num_points * 17 + 1));
				1939	if ((secrets == NULL) \|\| (pre_comp == NULL)
				1940	\|\| (mixed && (tmp_felems == NULL))) {
				1941	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
				1942	goto err;
				1943	}
				1944
				1945	/*
				1946	* we treat NULL scalars as 0, and NULL points as points at infinity,
				1947	* i.e., they contribute nothing to the linear combination
				1948	*/
				1949	for (i = 0; i < num_points; ++i) {
				1950	if (i == num) {
				1951	/*
				1952	* we didn't have a valid precomputation, so we pick the
				1953	* generator
				1954	*/
				1955	p = EC_GROUP_get0_generator(group);
				1956	p_scalar = scalar;
				1957	} else {
				1958	/* the i^th point */
				1959	p = points[i];
				1960	p_scalar = scalars[i];
				1961	}
				1962	if ((p_scalar != NULL) && (p != NULL)) {
				1963	/* reduce scalar to 0 <= scalar < 2^521 */
				1964	if ((BN_num_bits(p_scalar) > 521)
				1965	\|\| (BN_is_negative(p_scalar))) {
				1966	/*
				1967	* this is an unusual input, and we don't guarantee
				1968	* constant-timeness
				1969	*/
				1970	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
				1971	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				1972	goto err;
				1973	}
				1974	num_bytes = BN_bn2lebinpad(tmp_scalar,
				1975	secrets[i], sizeof(secrets[i]));
				1976	} else {
				1977	num_bytes = BN_bn2lebinpad(p_scalar,
				1978	secrets[i], sizeof(secrets[i]));
				1979	}
				1980	if (num_bytes < 0) {
				1981	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				1982	goto err;
				1983	}
				1984	/* precompute multiples */
				1985	if ((!BN_to_felem(x_out, p->X)) \|\|
				1986	(!BN_to_felem(y_out, p->Y)) \|\|
				1987	(!BN_to_felem(z_out, p->Z)))
				1988	goto err;
				1989	memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
				1990	memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
				1991	memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
				1992	for (j = 2; j <= 16; ++j) {
				1993	if (j & 1) {
				1994	point_add(pre_comp[i][j][0], pre_comp[i][j][1],
				1995	pre_comp[i][j][2], pre_comp[i][1][0],
				1996	pre_comp[i][1][1], pre_comp[i][1][2], 0,
				1997	pre_comp[i][j - 1][0],
				1998	pre_comp[i][j - 1][1],
				1999	pre_comp[i][j - 1][2]);
				2000	} else {
				2001	point_double(pre_comp[i][j][0], pre_comp[i][j][1],
				2002	pre_comp[i][j][2], pre_comp[i][j / 2][0],
				2003	pre_comp[i][j / 2][1],
				2004	pre_comp[i][j / 2][2]);
				2005	}
				2006	}
				2007	}
				2008	}
				2009	if (mixed)
				2010	make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
				2011	}
				2012
				2013	/* the scalar for the generator */
				2014	if ((scalar != NULL) && (have_pre_comp)) {
				2015	memset(g_secret, 0, sizeof(g_secret));
				2016	/* reduce scalar to 0 <= scalar < 2^521 */
				2017	if ((BN_num_bits(scalar) > 521) \|\| (BN_is_negative(scalar))) {
				2018	/*
				2019	* this is an unusual input, and we don't guarantee
				2020	* constant-timeness
				2021	*/
				2022	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
				2023	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				2024	goto err;
				2025	}
				2026	num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
				2027	} else {
				2028	num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
				2029	}
				2030	/* do the multiplication with generator precomputation */
				2031	batch_mul(x_out, y_out, z_out,
				2032	(const felem_bytearray(*))secrets, num_points,
				2033	g_secret,
				2034	mixed, (const felem(*)[17][3])pre_comp,
				2035	(const felem(*)[3])g_pre_comp);
				2036	} else {
				2037	/* do the multiplication without generator precomputation */
				2038	batch_mul(x_out, y_out, z_out,
				2039	(const felem_bytearray(*))secrets, num_points,
				2040	NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
				2041	}
				2042	/* reduce the output to its unique minimal representation */
				2043	felem_contract(x_in, x_out);
				2044	felem_contract(y_in, y_out);
				2045	felem_contract(z_in, z_out);
				2046	if ((!felem_to_BN(x, x_in)) \|\| (!felem_to_BN(y, y_in)) \|\|
				2047	(!felem_to_BN(z, z_in))) {
				2048	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				2049	goto err;
				2050	}
				2051	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
				2052
				2053	err:
				2054	BN_CTX_end(ctx);
				2055	EC_POINT_free(generator);
				2056	OPENSSL_free(secrets);
				2057	OPENSSL_free(pre_comp);
				2058	OPENSSL_free(tmp_felems);
				2059	return ret;
				2060	}
				2061
				2062	int ec_GFp_nistp521_precompute_mult(EC_GROUP group, BN_CTX ctx)
				2063	{
				2064	int ret = 0;
				2065	NISTP521_PRE_COMP *pre = NULL;
				2066	int i, j;
				2067	BN_CTX *new_ctx = NULL;
				2068	BIGNUM x, y;
				2069	EC_POINT *generator = NULL;
				2070	felem tmp_felems[16];
				2071
				2072	/* throw away old precomputation */
				2073	EC_pre_comp_free(group);
				2074	if (ctx == NULL)
				2075	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
				2076	return 0;
				2077	BN_CTX_start(ctx);
				2078	x = BN_CTX_get(ctx);
				2079	y = BN_CTX_get(ctx);
				2080	if (y == NULL)
				2081	goto err;
				2082	/* get the generator */
				2083	if (group->generator == NULL)
				2084	goto err;
				2085	generator = EC_POINT_new(group);
				2086	if (generator == NULL)
				2087	goto err;
				2088	BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
				2089	BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
				2090	if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
				2091	goto err;
				2092	if ((pre = nistp521_pre_comp_new()) == NULL)
				2093	goto err;
				2094	/*
				2095	* if the generator is the standard one, use built-in precomputation
				2096	*/
				2097	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
				2098	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
				2099	goto done;
				2100	}
				2101	if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) \|\|
				2102	(!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) \|\|
				2103	(!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
				2104	goto err;
				2105	/* compute 2^130G, 2^260G, 2^390G /
				2106	for (i = 1; i <= 4; i <<= 1) {
				2107	point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
				2108	pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
				2109	pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
				2110	for (j = 0; j < 129; ++j) {
				2111	point_double(pre->g_pre_comp[2 * i][0],
				2112	pre->g_pre_comp[2 * i][1],
				2113	pre->g_pre_comp[2 * i][2],
				2114	pre->g_pre_comp[2 * i][0],
				2115	pre->g_pre_comp[2 * i][1],
				2116	pre->g_pre_comp[2 * i][2]);
				2117	}
				2118	}
				2119	/* g_pre_comp[0] is the point at infinity */
				2120	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
				2121	/* the remaining multiples */
				2122	/* 2^130G + 2^260G */
				2123	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
				2124	pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
				2125	pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
				2126	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
				2127	pre->g_pre_comp[2][2]);
				2128	/* 2^130G + 2^390G */
				2129	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
				2130	pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
				2131	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
				2132	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
				2133	pre->g_pre_comp[2][2]);
				2134	/* 2^260G + 2^390G */
				2135	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
				2136	pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
				2137	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
				2138	0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
				2139	pre->g_pre_comp[4][2]);
				2140	/* 2^130G + 2^260G + 2^390G /
				2141	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
				2142	pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
				2143	pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
				2144	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
				2145	pre->g_pre_comp[2][2]);
				2146	for (i = 1; i < 8; ++i) {
				2147	/* odd multiples: add G */
				2148	point_add(pre->g_pre_comp[2 * i + 1][0],
				2149	pre->g_pre_comp[2 * i + 1][1],
				2150	pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
				2151	pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
				2152	pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
				2153	pre->g_pre_comp[1][2]);
				2154	}
				2155	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
				2156
				2157	done:
				2158	SETPRECOMP(group, nistp521, pre);
				2159	ret = 1;
				2160	pre = NULL;
				2161	err:
				2162	BN_CTX_end(ctx);
				2163	EC_POINT_free(generator);
				2164	BN_CTX_free(new_ctx);
				2165	EC_nistp521_pre_comp_free(pre);
				2166	return ret;
				2167	}
				2168
				2169	int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
				2170	{
				2171	return HAVEPRECOMP(group, nistp521);
				2172	}
				2173
				2174	#endif