Blame - ap/lib/libssl/openssl-1.1.1o/crypto/ec/ecp_nistp256.c - R306

blob: e23e9d2a0b34836c49e2f8af5af6b071edb1dac1 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
				3	*
				4	* Licensed under the OpenSSL license (the "License"). You may not use
				5	* this file except in compliance with the License. You can obtain a copy
				6	* in the file LICENSE in the source distribution or at
				7	* https://www.openssl.org/source/license.html
				8	*/
				9
				10	/* Copyright 2011 Google Inc.
				11	*
				12	* Licensed under the Apache License, Version 2.0 (the "License");
				13	*
				14	* you may not use this file except in compliance with the License.
				15	* You may obtain a copy of the License at
				16	*
				17	* http://www.apache.org/licenses/LICENSE-2.0
				18	*
				19	* Unless required by applicable law or agreed to in writing, software
				20	* distributed under the License is distributed on an "AS IS" BASIS,
				21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				22	* See the License for the specific language governing permissions and
				23	* limitations under the License.
				24	*/
				25
				26	/*
				27	* A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
				28	*
				29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
				30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
				31	* work which got its smarts from Daniel J. Bernstein's work on the same.
				32	*/
				33
				34	#include <openssl/opensslconf.h>
				35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
				36	NON_EMPTY_TRANSLATION_UNIT
				37	#else
				38
				39	# include <stdint.h>
				40	# include <string.h>
				41	# include <openssl/err.h>
				42	# include "ec_local.h"
				43
				44	# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
				45	/* even with gcc, the typedef won't work for 32-bit platforms */
				46	typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
				47	* platforms */
				48	typedef __int128_t int128_t;
				49	# else
				50	# error "Your compiler doesn't appear to support 128-bit integer types"
				51	# endif
				52
				53	typedef uint8_t u8;
				54	typedef uint32_t u32;
				55	typedef uint64_t u64;
				56
				57	/*
				58	* The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
				59	* can serialise an element of this field into 32 bytes. We call this an
				60	* felem_bytearray.
				61	*/
				62
				63	typedef u8 felem_bytearray[32];
				64
				65	/*
				66	* These are the parameters of P256, taken from FIPS 186-3, page 86. These
				67	* values are big-endian.
				68	*/
				69	static const felem_bytearray nistp256_curve_params[5] = {
				70	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
				71	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
				72	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
				73	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
				74	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
				75	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
				76	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
				77	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
				78	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
				79	0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
				80	0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
				81	0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
				82	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
				83	0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
				84	0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
				85	0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
				86	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
				87	0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
				88	0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
				89	0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
				90	};
				91
				92	/*-
				93	* The representation of field elements.
				94	* ------------------------------------
				95	*
				96	* We represent field elements with either four 128-bit values, eight 128-bit
				97	* values, or four 64-bit values. The field element represented is:
				98	* v[0]2^0 + v[1]2^64 + v[2]2^128 + v[3]2^192 (mod p)
				99	* or:
				100	* v[0]2^0 + v[1]2^64 + v[2]2^128 + ... + v[8]2^512 (mod p)
				101	*
				102	* 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
				103	* apart, but are 128-bits wide, the most significant bits of each limb overlap
				104	* with the least significant bits of the next.
				105	*
				106	* A field element with four limbs is an 'felem'. One with eight limbs is a
				107	* 'longfelem'
				108	*
				109	* A field element with four, 64-bit values is called a 'smallfelem'. Small
				110	* values are used as intermediate values before multiplication.
				111	*/
				112
				113	# define NLIMBS 4
				114
				115	typedef uint128_t limb;
				116	typedef limb felem[NLIMBS];
				117	typedef limb longfelem[NLIMBS * 2];
				118	typedef u64 smallfelem[NLIMBS];
				119
				120	/* This is the value of the prime as four 64-bit words, little-endian. */
				121	static const u64 kPrime[4] =
				122	{ 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
				123	static const u64 bottom63bits = 0x7ffffffffffffffful;
				124
				125	/*
				126	* bin32_to_felem takes a little-endian byte array and converts it into felem
				127	* form. This assumes that the CPU is little-endian.
				128	*/
				129	static void bin32_to_felem(felem out, const u8 in[32])
				130	{
				131	out[0] = ((u64 )&in[0]);
				132	out[1] = ((u64 )&in[8]);
				133	out[2] = ((u64 )&in[16]);
				134	out[3] = ((u64 )&in[24]);
				135	}
				136
				137	/*
				138	* smallfelem_to_bin32 takes a smallfelem and serialises into a little
				139	* endian, 32 byte array. This assumes that the CPU is little-endian.
				140	*/
				141	static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
				142	{
				143	((u64 )&out[0]) = in[0];
				144	((u64 )&out[8]) = in[1];
				145	((u64 )&out[16]) = in[2];
				146	((u64 )&out[24]) = in[3];
				147	}
				148
				149	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
				150	static int BN_to_felem(felem out, const BIGNUM *bn)
				151	{
				152	felem_bytearray b_out;
				153	int num_bytes;
				154
				155	if (BN_is_negative(bn)) {
				156	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
				157	return 0;
				158	}
				159	num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
				160	if (num_bytes < 0) {
				161	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
				162	return 0;
				163	}
				164	bin32_to_felem(out, b_out);
				165	return 1;
				166	}
				167
				168	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
				169	static BIGNUM smallfelem_to_BN(BIGNUM out, const smallfelem in)
				170	{
				171	felem_bytearray b_out;
				172	smallfelem_to_bin32(b_out, in);
				173	return BN_lebin2bn(b_out, sizeof(b_out), out);
				174	}
				175
				176	/*-
				177	* Field operations
				178	* ----------------
				179	*/
				180
				181	static void smallfelem_one(smallfelem out)
				182	{
				183	out[0] = 1;
				184	out[1] = 0;
				185	out[2] = 0;
				186	out[3] = 0;
				187	}
				188
				189	static void smallfelem_assign(smallfelem out, const smallfelem in)
				190	{
				191	out[0] = in[0];
				192	out[1] = in[1];
				193	out[2] = in[2];
				194	out[3] = in[3];
				195	}
				196
				197	static void felem_assign(felem out, const felem in)
				198	{
				199	out[0] = in[0];
				200	out[1] = in[1];
				201	out[2] = in[2];
				202	out[3] = in[3];
				203	}
				204
				205	/* felem_sum sets out = out + in. */
				206	static void felem_sum(felem out, const felem in)
				207	{
				208	out[0] += in[0];
				209	out[1] += in[1];
				210	out[2] += in[2];
				211	out[3] += in[3];
				212	}
				213
				214	/* felem_small_sum sets out = out + in. */
				215	static void felem_small_sum(felem out, const smallfelem in)
				216	{
				217	out[0] += in[0];
				218	out[1] += in[1];
				219	out[2] += in[2];
				220	out[3] += in[3];
				221	}
				222
				223	/* felem_scalar sets out = out * scalar */
				224	static void felem_scalar(felem out, const u64 scalar)
				225	{
				226	out[0] *= scalar;
				227	out[1] *= scalar;
				228	out[2] *= scalar;
				229	out[3] *= scalar;
				230	}
				231
				232	/* longfelem_scalar sets out = out * scalar */
				233	static void longfelem_scalar(longfelem out, const u64 scalar)
				234	{
				235	out[0] *= scalar;
				236	out[1] *= scalar;
				237	out[2] *= scalar;
				238	out[3] *= scalar;
				239	out[4] *= scalar;
				240	out[5] *= scalar;
				241	out[6] *= scalar;
				242	out[7] *= scalar;
				243	}
				244
				245	# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
				246	# define two105 (((limb)1) << 105)
				247	# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
				248
				249	/* zero105 is 0 mod p */
				250	static const felem zero105 =
				251	{ two105m41m9, two105, two105m41p9, two105m41p9 };
				252
				253	/*-
				254	* smallfelem_neg sets \|out\| to \|-small\|
				255	* On exit:
				256	* out[i] < out[i] + 2^105
				257	*/
				258	static void smallfelem_neg(felem out, const smallfelem small)
				259	{
				260	/* In order to prevent underflow, we subtract from 0 mod p. */
				261	out[0] = zero105[0] - small[0];
				262	out[1] = zero105[1] - small[1];
				263	out[2] = zero105[2] - small[2];
				264	out[3] = zero105[3] - small[3];
				265	}
				266
				267	/*-
				268	* felem_diff subtracts \|in\| from \|out\|
				269	* On entry:
				270	* in[i] < 2^104
				271	* On exit:
				272	* out[i] < out[i] + 2^105
				273	*/
				274	static void felem_diff(felem out, const felem in)
				275	{
				276	/*
				277	* In order to prevent underflow, we add 0 mod p before subtracting.
				278	*/
				279	out[0] += zero105[0];
				280	out[1] += zero105[1];
				281	out[2] += zero105[2];
				282	out[3] += zero105[3];
				283
				284	out[0] -= in[0];
				285	out[1] -= in[1];
				286	out[2] -= in[2];
				287	out[3] -= in[3];
				288	}
				289
				290	# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
				291	# define two107 (((limb)1) << 107)
				292	# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
				293
				294	/* zero107 is 0 mod p */
				295	static const felem zero107 =
				296	{ two107m43m11, two107, two107m43p11, two107m43p11 };
				297
				298	/*-
				299	* An alternative felem_diff for larger inputs \|in\|
				300	* felem_diff_zero107 subtracts \|in\| from \|out\|
				301	* On entry:
				302	* in[i] < 2^106
				303	* On exit:
				304	* out[i] < out[i] + 2^107
				305	*/
				306	static void felem_diff_zero107(felem out, const felem in)
				307	{
				308	/*
				309	* In order to prevent underflow, we add 0 mod p before subtracting.
				310	*/
				311	out[0] += zero107[0];
				312	out[1] += zero107[1];
				313	out[2] += zero107[2];
				314	out[3] += zero107[3];
				315
				316	out[0] -= in[0];
				317	out[1] -= in[1];
				318	out[2] -= in[2];
				319	out[3] -= in[3];
				320	}
				321
				322	/*-
				323	* longfelem_diff subtracts \|in\| from \|out\|
				324	* On entry:
				325	* in[i] < 7*2^67
				326	* On exit:
				327	* out[i] < out[i] + 2^70 + 2^40
				328	*/
				329	static void longfelem_diff(longfelem out, const longfelem in)
				330	{
				331	static const limb two70m8p6 =
				332	(((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
				333	static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
				334	static const limb two70 = (((limb) 1) << 70);
				335	static const limb two70m40m38p6 =
				336	(((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
				337	(((limb) 1) << 6);
				338	static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
				339
				340	/* add 0 mod p to avoid underflow */
				341	out[0] += two70m8p6;
				342	out[1] += two70p40;
				343	out[2] += two70;
				344	out[3] += two70m40m38p6;
				345	out[4] += two70m6;
				346	out[5] += two70m6;
				347	out[6] += two70m6;
				348	out[7] += two70m6;
				349
				350	/* in[i] < 72^67 < 2^70 - 2^40 - 2^38 + 2^6 /
				351	out[0] -= in[0];
				352	out[1] -= in[1];
				353	out[2] -= in[2];
				354	out[3] -= in[3];
				355	out[4] -= in[4];
				356	out[5] -= in[5];
				357	out[6] -= in[6];
				358	out[7] -= in[7];
				359	}
				360
				361	# define two64m0 (((limb)1) << 64) - 1
				362	# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
				363	# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
				364	# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
				365
				366	/* zero110 is 0 mod p */
				367	static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
				368
				369	/*-
				370	* felem_shrink converts an felem into a smallfelem. The result isn't quite
				371	* minimal as the value may be greater than p.
				372	*
				373	* On entry:
				374	* in[i] < 2^109
				375	* On exit:
				376	* out[i] < 2^64
				377	*/
				378	static void felem_shrink(smallfelem out, const felem in)
				379	{
				380	felem tmp;
				381	u64 a, b, mask;
				382	u64 high, low;
				383	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
				384
				385	/* Carry 2->3 */
				386	tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
				387	/* tmp[3] < 2^110 */
				388
				389	tmp[2] = zero110[2] + (u64)in[2];
				390	tmp[0] = zero110[0] + in[0];
				391	tmp[1] = zero110[1] + in[1];
				392	/* tmp[0] < 2110, tmp[1] < 2^111, tmp[2] < 265 */
				393
				394	/*
				395	* We perform two partial reductions where we eliminate the high-word of
				396	* tmp[3]. We don't update the other words till the end.
				397	*/
				398	a = tmp[3] >> 64; /* a < 2^46 */
				399	tmp[3] = (u64)tmp[3];
				400	tmp[3] -= a;
				401	tmp[3] += ((limb) a) << 32;
				402	/* tmp[3] < 2^79 */
				403
				404	b = a;
				405	a = tmp[3] >> 64; /* a < 2^15 */
				406	b += a; /* b < 2^46 + 2^15 < 2^47 */
				407	tmp[3] = (u64)tmp[3];
				408	tmp[3] -= a;
				409	tmp[3] += ((limb) a) << 32;
				410	/* tmp[3] < 2^64 + 2^47 */
				411
				412	/*
				413	* This adjusts the other two words to complete the two partial
				414	* reductions.
				415	*/
				416	tmp[0] += b;
				417	tmp[1] -= (((limb) b) << 32);
				418
				419	/*
				420	* In order to make space in tmp[3] for the carry from 2 -> 3, we
				421	* conditionally subtract kPrime if tmp[3] is large enough.
				422	*/
				423	high = (u64)(tmp[3] >> 64);
				424	/* As tmp[3] < 2^65, high is either 1 or 0 */
				425	high = 0 - high;
				426	/*-
				427	* high is:
				428	* all ones if the high word of tmp[3] is 1
				429	* all zeros if the high word of tmp[3] if 0
				430	*/
				431	low = (u64)tmp[3];
				432	mask = 0 - (low >> 63);
				433	/*-
				434	* mask is:
				435	* all ones if the MSB of low is 1
				436	* all zeros if the MSB of low if 0
				437	*/
				438	low &= bottom63bits;
				439	low -= kPrime3Test;
				440	/* if low was greater than kPrime3Test then the MSB is zero */
				441	low = ~low;
				442	low = 0 - (low >> 63);
				443	/*-
				444	* low is:
				445	* all ones if low was > kPrime3Test
				446	* all zeros if low was <= kPrime3Test
				447	*/
				448	mask = (mask & low) \| high;
				449	tmp[0] -= mask & kPrime[0];
				450	tmp[1] -= mask & kPrime[1];
				451	/* kPrime[2] is zero, so omitted */
				452	tmp[3] -= mask & kPrime[3];
				453	/* tmp[3] < 264 - 232 + 1 */
				454
				455	tmp[1] += ((u64)(tmp[0] >> 64));
				456	tmp[0] = (u64)tmp[0];
				457	tmp[2] += ((u64)(tmp[1] >> 64));
				458	tmp[1] = (u64)tmp[1];
				459	tmp[3] += ((u64)(tmp[2] >> 64));
				460	tmp[2] = (u64)tmp[2];
				461	/* tmp[i] < 2^64 */
				462
				463	out[0] = tmp[0];
				464	out[1] = tmp[1];
				465	out[2] = tmp[2];
				466	out[3] = tmp[3];
				467	}
				468
				469	/* smallfelem_expand converts a smallfelem to an felem */
				470	static void smallfelem_expand(felem out, const smallfelem in)
				471	{
				472	out[0] = in[0];
				473	out[1] = in[1];
				474	out[2] = in[2];
				475	out[3] = in[3];
				476	}
				477
				478	/*-
				479	* smallfelem_square sets \|out\| = \|small\|^2
				480	* On entry:
				481	* small[i] < 2^64
				482	* On exit:
				483	* out[i] < 7 * 2^64 < 2^67
				484	*/
				485	static void smallfelem_square(longfelem out, const smallfelem small)
				486	{
				487	limb a;
				488	u64 high, low;
				489
				490	a = ((uint128_t) small[0]) * small[0];
				491	low = a;
				492	high = a >> 64;
				493	out[0] = low;
				494	out[1] = high;
				495
				496	a = ((uint128_t) small[0]) * small[1];
				497	low = a;
				498	high = a >> 64;
				499	out[1] += low;
				500	out[1] += low;
				501	out[2] = high;
				502
				503	a = ((uint128_t) small[0]) * small[2];
				504	low = a;
				505	high = a >> 64;
				506	out[2] += low;
				507	out[2] *= 2;
				508	out[3] = high;
				509
				510	a = ((uint128_t) small[0]) * small[3];
				511	low = a;
				512	high = a >> 64;
				513	out[3] += low;
				514	out[4] = high;
				515
				516	a = ((uint128_t) small[1]) * small[2];
				517	low = a;
				518	high = a >> 64;
				519	out[3] += low;
				520	out[3] *= 2;
				521	out[4] += high;
				522
				523	a = ((uint128_t) small[1]) * small[1];
				524	low = a;
				525	high = a >> 64;
				526	out[2] += low;
				527	out[3] += high;
				528
				529	a = ((uint128_t) small[1]) * small[3];
				530	low = a;
				531	high = a >> 64;
				532	out[4] += low;
				533	out[4] *= 2;
				534	out[5] = high;
				535
				536	a = ((uint128_t) small[2]) * small[3];
				537	low = a;
				538	high = a >> 64;
				539	out[5] += low;
				540	out[5] *= 2;
				541	out[6] = high;
				542	out[6] += high;
				543
				544	a = ((uint128_t) small[2]) * small[2];
				545	low = a;
				546	high = a >> 64;
				547	out[4] += low;
				548	out[5] += high;
				549
				550	a = ((uint128_t) small[3]) * small[3];
				551	low = a;
				552	high = a >> 64;
				553	out[6] += low;
				554	out[7] = high;
				555	}
				556
				557	/*-
				558	* felem_square sets \|out\| = \|in\|^2
				559	* On entry:
				560	* in[i] < 2^109
				561	* On exit:
				562	* out[i] < 7 * 2^64 < 2^67
				563	*/
				564	static void felem_square(longfelem out, const felem in)
				565	{
				566	u64 small[4];
				567	felem_shrink(small, in);
				568	smallfelem_square(out, small);
				569	}
				570
				571	/*-
				572	* smallfelem_mul sets \|out\| = \|small1\| * \|small2\|
				573	* On entry:
				574	* small1[i] < 2^64
				575	* small2[i] < 2^64
				576	* On exit:
				577	* out[i] < 7 * 2^64 < 2^67
				578	*/
				579	static void smallfelem_mul(longfelem out, const smallfelem small1,
				580	const smallfelem small2)
				581	{
				582	limb a;
				583	u64 high, low;
				584
				585	a = ((uint128_t) small1[0]) * small2[0];
				586	low = a;
				587	high = a >> 64;
				588	out[0] = low;
				589	out[1] = high;
				590
				591	a = ((uint128_t) small1[0]) * small2[1];
				592	low = a;
				593	high = a >> 64;
				594	out[1] += low;
				595	out[2] = high;
				596
				597	a = ((uint128_t) small1[1]) * small2[0];
				598	low = a;
				599	high = a >> 64;
				600	out[1] += low;
				601	out[2] += high;
				602
				603	a = ((uint128_t) small1[0]) * small2[2];
				604	low = a;
				605	high = a >> 64;
				606	out[2] += low;
				607	out[3] = high;
				608
				609	a = ((uint128_t) small1[1]) * small2[1];
				610	low = a;
				611	high = a >> 64;
				612	out[2] += low;
				613	out[3] += high;
				614
				615	a = ((uint128_t) small1[2]) * small2[0];
				616	low = a;
				617	high = a >> 64;
				618	out[2] += low;
				619	out[3] += high;
				620
				621	a = ((uint128_t) small1[0]) * small2[3];
				622	low = a;
				623	high = a >> 64;
				624	out[3] += low;
				625	out[4] = high;
				626
				627	a = ((uint128_t) small1[1]) * small2[2];
				628	low = a;
				629	high = a >> 64;
				630	out[3] += low;
				631	out[4] += high;
				632
				633	a = ((uint128_t) small1[2]) * small2[1];
				634	low = a;
				635	high = a >> 64;
				636	out[3] += low;
				637	out[4] += high;
				638
				639	a = ((uint128_t) small1[3]) * small2[0];
				640	low = a;
				641	high = a >> 64;
				642	out[3] += low;
				643	out[4] += high;
				644
				645	a = ((uint128_t) small1[1]) * small2[3];
				646	low = a;
				647	high = a >> 64;
				648	out[4] += low;
				649	out[5] = high;
				650
				651	a = ((uint128_t) small1[2]) * small2[2];
				652	low = a;
				653	high = a >> 64;
				654	out[4] += low;
				655	out[5] += high;
				656
				657	a = ((uint128_t) small1[3]) * small2[1];
				658	low = a;
				659	high = a >> 64;
				660	out[4] += low;
				661	out[5] += high;
				662
				663	a = ((uint128_t) small1[2]) * small2[3];
				664	low = a;
				665	high = a >> 64;
				666	out[5] += low;
				667	out[6] = high;
				668
				669	a = ((uint128_t) small1[3]) * small2[2];
				670	low = a;
				671	high = a >> 64;
				672	out[5] += low;
				673	out[6] += high;
				674
				675	a = ((uint128_t) small1[3]) * small2[3];
				676	low = a;
				677	high = a >> 64;
				678	out[6] += low;
				679	out[7] = high;
				680	}
				681
				682	/*-
				683	* felem_mul sets \|out\| = \|in1\| * \|in2\|
				684	* On entry:
				685	* in1[i] < 2^109
				686	* in2[i] < 2^109
				687	* On exit:
				688	* out[i] < 7 * 2^64 < 2^67
				689	*/
				690	static void felem_mul(longfelem out, const felem in1, const felem in2)
				691	{
				692	smallfelem small1, small2;
				693	felem_shrink(small1, in1);
				694	felem_shrink(small2, in2);
				695	smallfelem_mul(out, small1, small2);
				696	}
				697
				698	/*-
				699	* felem_small_mul sets \|out\| = \|small1\| * \|in2\|
				700	* On entry:
				701	* small1[i] < 2^64
				702	* in2[i] < 2^109
				703	* On exit:
				704	* out[i] < 7 * 2^64 < 2^67
				705	*/
				706	static void felem_small_mul(longfelem out, const smallfelem small1,
				707	const felem in2)
				708	{
				709	smallfelem small2;
				710	felem_shrink(small2, in2);
				711	smallfelem_mul(out, small1, small2);
				712	}
				713
				714	# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
				715	# define two100 (((limb)1) << 100)
				716	# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
				717	/* zero100 is 0 mod p */
				718	static const felem zero100 =
				719	{ two100m36m4, two100, two100m36p4, two100m36p4 };
				720
				721	/*-
				722	* Internal function for the different flavours of felem_reduce.
				723	* felem_reduce_ reduces the higher coefficients in[4]-in[7].
				724	* On entry:
				725	* out[0] >= in[6] + 2^32in[6] + in[7] + 2^32in[7]
				726	* out[1] >= in[7] + 2^32*in[4]
				727	* out[2] >= in[5] + 2^32*in[5]
				728	* out[3] >= in[4] + 2^32in[5] + 2^32in[6]
				729	* On exit:
				730	* out[0] <= out[0] + in[4] + 2^32*in[5]
				731	* out[1] <= out[1] + in[5] + 2^33*in[6]
				732	* out[2] <= out[2] + in[7] + 2in[6] + 2^33in[7]
				733	* out[3] <= out[3] + 2^32in[4] + 3in[7]
				734	*/
				735	static void felem_reduce_(felem out, const longfelem in)
				736	{
				737	int128_t c;
				738	/* combine common terms from below */
				739	c = in[4] + (in[5] << 32);
				740	out[0] += c;
				741	out[3] -= c;
				742
				743	c = in[5] - in[7];
				744	out[1] += c;
				745	out[2] -= c;
				746
				747	/* the remaining terms */
				748	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
				749	out[1] -= (in[4] << 32);
				750	out[3] += (in[4] << 32);
				751
				752	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
				753	out[2] -= (in[5] << 32);
				754
				755	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
				756	out[0] -= in[6];
				757	out[0] -= (in[6] << 32);
				758	out[1] += (in[6] << 33);
				759	out[2] += (in[6] * 2);
				760	out[3] -= (in[6] << 32);
				761
				762	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
				763	out[0] -= in[7];
				764	out[0] -= (in[7] << 32);
				765	out[2] += (in[7] << 33);
				766	out[3] += (in[7] * 3);
				767	}
				768
				769	/*-
				770	* felem_reduce converts a longfelem into an felem.
				771	* To be called directly after felem_square or felem_mul.
				772	* On entry:
				773	* in[0] < 2^64, in[1] < 32^64, in[2] < 52^64, in[3] < 7*2^64
				774	* in[4] < 72^64, in[5] < 52^64, in[6] < 32^64, in[7] < 264
				775	* On exit:
				776	* out[i] < 2^101
				777	*/
				778	static void felem_reduce(felem out, const longfelem in)
				779	{
				780	out[0] = zero100[0] + in[0];
				781	out[1] = zero100[1] + in[1];
				782	out[2] = zero100[2] + in[2];
				783	out[3] = zero100[3] + in[3];
				784
				785	felem_reduce_(out, in);
				786
				787	/*-
				788	* out[0] > 2^100 - 2^36 - 2^4 - 32^64 - 32^96 - 2^64 - 2^96 > 0
				789	* out[1] > 2^100 - 2^64 - 7*2^96 > 0
				790	* out[2] > 2^100 - 2^36 + 2^4 - 52^64 - 52^96 > 0
				791	* out[3] > 2^100 - 2^36 + 2^4 - 72^64 - 52^96 - 3*2^96 > 0
				792	*
				793	* out[0] < 2^100 + 2^64 + 72^64 + 52^96 < 2^101
				794	* out[1] < 2^100 + 32^64 + 52^64 + 3*2^97 < 2^101
				795	* out[2] < 2^100 + 52^64 + 2^64 + 32^65 + 2^97 < 2^101
				796	* out[3] < 2^100 + 72^64 + 72^96 + 3*2^64 < 2^101
				797	*/
				798	}
				799
				800	/*-
				801	* felem_reduce_zero105 converts a larger longfelem into an felem.
				802	* On entry:
				803	* in[0] < 2^71
				804	* On exit:
				805	* out[i] < 2^106
				806	*/
				807	static void felem_reduce_zero105(felem out, const longfelem in)
				808	{
				809	out[0] = zero105[0] + in[0];
				810	out[1] = zero105[1] + in[1];
				811	out[2] = zero105[2] + in[2];
				812	out[3] = zero105[3] + in[3];
				813
				814	felem_reduce_(out, in);
				815
				816	/*-
				817	* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
				818	* out[1] > 2^105 - 2^71 - 2^103 > 0
				819	* out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
				820	* out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
				821	*
				822	* out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
				823	* out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
				824	* out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
				825	* out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
				826	*/
				827	}
				828
				829	/*
				830	* subtract_u64 sets result = result - v and *carry to one if the
				831	* subtraction underflowed.
				832	*/
				833	static void subtract_u64(u64 result, u64 carry, u64 v)
				834	{
				835	uint128_t r = *result;
				836	r -= v;
				837	*carry = (r >> 64) & 1;
				838	*result = (u64)r;
				839	}
				840
				841	/*
				842	* felem_contract converts \|in\| to its unique, minimal representation. On
				843	* entry: in[i] < 2^109
				844	*/
				845	static void felem_contract(smallfelem out, const felem in)
				846	{
				847	unsigned i;
				848	u64 all_equal_so_far = 0, result = 0, carry;
				849
				850	felem_shrink(out, in);
				851	/* small is minimal except that the value might be > p */
				852
				853	all_equal_so_far--;
				854	/*
				855	* We are doing a constant time test if out >= kPrime. We need to compare
				856	* each u64, from most-significant to least significant. For each one, if
				857	* all words so far have been equal (m is all ones) then a non-equal
				858	* result is the answer. Otherwise we continue.
				859	*/
				860	for (i = 3; i < 4; i--) {
				861	u64 equal;
				862	uint128_t a = ((uint128_t) kPrime[i]) - out[i];
				863	/*
				864	* if out[i] > kPrime[i] then a will underflow and the high 64-bits
				865	* will all be set.
				866	*/
				867	result \|= all_equal_so_far & ((u64)(a >> 64));
				868
				869	/*
				870	* if kPrime[i] == out[i] then \|equal\| will be all zeros and the
				871	* decrement will make it all ones.
				872	*/
				873	equal = kPrime[i] ^ out[i];
				874	equal--;
				875	equal &= equal << 32;
				876	equal &= equal << 16;
				877	equal &= equal << 8;
				878	equal &= equal << 4;
				879	equal &= equal << 2;
				880	equal &= equal << 1;
				881	equal = 0 - (equal >> 63);
				882
				883	all_equal_so_far &= equal;
				884	}
				885
				886	/*
				887	* if all_equal_so_far is still all ones then the two values are equal
				888	* and so out >= kPrime is true.
				889	*/
				890	result \|= all_equal_so_far;
				891
				892	/* if out >= kPrime then we subtract kPrime. */
				893	subtract_u64(&out[0], &carry, result & kPrime[0]);
				894	subtract_u64(&out[1], &carry, carry);
				895	subtract_u64(&out[2], &carry, carry);
				896	subtract_u64(&out[3], &carry, carry);
				897
				898	subtract_u64(&out[1], &carry, result & kPrime[1]);
				899	subtract_u64(&out[2], &carry, carry);
				900	subtract_u64(&out[3], &carry, carry);
				901
				902	subtract_u64(&out[2], &carry, result & kPrime[2]);
				903	subtract_u64(&out[3], &carry, carry);
				904
				905	subtract_u64(&out[3], &carry, result & kPrime[3]);
				906	}
				907
				908	static void smallfelem_square_contract(smallfelem out, const smallfelem in)
				909	{
				910	longfelem longtmp;
				911	felem tmp;
				912
				913	smallfelem_square(longtmp, in);
				914	felem_reduce(tmp, longtmp);
				915	felem_contract(out, tmp);
				916	}
				917
				918	static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
				919	const smallfelem in2)
				920	{
				921	longfelem longtmp;
				922	felem tmp;
				923
				924	smallfelem_mul(longtmp, in1, in2);
				925	felem_reduce(tmp, longtmp);
				926	felem_contract(out, tmp);
				927	}
				928
				929	/*-
				930	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
				931	* otherwise.
				932	* On entry:
				933	* small[i] < 2^64
				934	*/
				935	static limb smallfelem_is_zero(const smallfelem small)
				936	{
				937	limb result;
				938	u64 is_p;
				939
				940	u64 is_zero = small[0] \| small[1] \| small[2] \| small[3];
				941	is_zero--;
				942	is_zero &= is_zero << 32;
				943	is_zero &= is_zero << 16;
				944	is_zero &= is_zero << 8;
				945	is_zero &= is_zero << 4;
				946	is_zero &= is_zero << 2;
				947	is_zero &= is_zero << 1;
				948	is_zero = 0 - (is_zero >> 63);
				949
				950	is_p = (small[0] ^ kPrime[0]) \|
				951	(small[1] ^ kPrime[1]) \|
				952	(small[2] ^ kPrime[2]) \| (small[3] ^ kPrime[3]);
				953	is_p--;
				954	is_p &= is_p << 32;
				955	is_p &= is_p << 16;
				956	is_p &= is_p << 8;
				957	is_p &= is_p << 4;
				958	is_p &= is_p << 2;
				959	is_p &= is_p << 1;
				960	is_p = 0 - (is_p >> 63);
				961
				962	is_zero \|= is_p;
				963
				964	result = is_zero;
				965	result \|= ((limb) is_zero) << 64;
				966	return result;
				967	}
				968
				969	static int smallfelem_is_zero_int(const void *small)
				970	{
				971	return (int)(smallfelem_is_zero(small) & ((limb) 1));
				972	}
				973
				974	/*-
				975	* felem_inv calculates \|out\| = \|in\|^{-1}
				976	*
				977	* Based on Fermat's Little Theorem:
				978	* a^p = a (mod p)
				979	* a^{p-1} = 1 (mod p)
				980	* a^{p-2} = a^{-1} (mod p)
				981	*/
				982	static void felem_inv(felem out, const felem in)
				983	{
				984	felem ftmp, ftmp2;
				985	/* each e_I will hold \|in\|^{2^I - 1} */
				986	felem e2, e4, e8, e16, e32, e64;
				987	longfelem tmp;
				988	unsigned i;
				989
				990	felem_square(tmp, in);
				991	felem_reduce(ftmp, tmp); /* 2^1 */
				992	felem_mul(tmp, in, ftmp);
				993	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
				994	felem_assign(e2, ftmp);
				995	felem_square(tmp, ftmp);
				996	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
				997	felem_square(tmp, ftmp);
				998	felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */
				999	felem_mul(tmp, ftmp, e2);
				1000	felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */
				1001	felem_assign(e4, ftmp);
				1002	felem_square(tmp, ftmp);
				1003	felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */
				1004	felem_square(tmp, ftmp);
				1005	felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */
				1006	felem_square(tmp, ftmp);
				1007	felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */
				1008	felem_square(tmp, ftmp);
				1009	felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */
				1010	felem_mul(tmp, ftmp, e4);
				1011	felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */
				1012	felem_assign(e8, ftmp);
				1013	for (i = 0; i < 8; i++) {
				1014	felem_square(tmp, ftmp);
				1015	felem_reduce(ftmp, tmp);
				1016	} /* 2^16 - 2^8 */
				1017	felem_mul(tmp, ftmp, e8);
				1018	felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */
				1019	felem_assign(e16, ftmp);
				1020	for (i = 0; i < 16; i++) {
				1021	felem_square(tmp, ftmp);
				1022	felem_reduce(ftmp, tmp);
				1023	} /* 2^32 - 2^16 */
				1024	felem_mul(tmp, ftmp, e16);
				1025	felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */
				1026	felem_assign(e32, ftmp);
				1027	for (i = 0; i < 32; i++) {
				1028	felem_square(tmp, ftmp);
				1029	felem_reduce(ftmp, tmp);
				1030	} /* 2^64 - 2^32 */
				1031	felem_assign(e64, ftmp);
				1032	felem_mul(tmp, ftmp, in);
				1033	felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */
				1034	for (i = 0; i < 192; i++) {
				1035	felem_square(tmp, ftmp);
				1036	felem_reduce(ftmp, tmp);
				1037	} /* 2^256 - 2^224 + 2^192 */
				1038
				1039	felem_mul(tmp, e64, e32);
				1040	felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
				1041	for (i = 0; i < 16; i++) {
				1042	felem_square(tmp, ftmp2);
				1043	felem_reduce(ftmp2, tmp);
				1044	} /* 2^80 - 2^16 */
				1045	felem_mul(tmp, ftmp2, e16);
				1046	felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
				1047	for (i = 0; i < 8; i++) {
				1048	felem_square(tmp, ftmp2);
				1049	felem_reduce(ftmp2, tmp);
				1050	} /* 2^88 - 2^8 */
				1051	felem_mul(tmp, ftmp2, e8);
				1052	felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
				1053	for (i = 0; i < 4; i++) {
				1054	felem_square(tmp, ftmp2);
				1055	felem_reduce(ftmp2, tmp);
				1056	} /* 2^92 - 2^4 */
				1057	felem_mul(tmp, ftmp2, e4);
				1058	felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
				1059	felem_square(tmp, ftmp2);
				1060	felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
				1061	felem_square(tmp, ftmp2);
				1062	felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
				1063	felem_mul(tmp, ftmp2, e2);
				1064	felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
				1065	felem_square(tmp, ftmp2);
				1066	felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
				1067	felem_square(tmp, ftmp2);
				1068	felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
				1069	felem_mul(tmp, ftmp2, in);
				1070	felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
				1071
				1072	felem_mul(tmp, ftmp2, ftmp);
				1073	felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
				1074	}
				1075
				1076	static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
				1077	{
				1078	felem tmp;
				1079
				1080	smallfelem_expand(tmp, in);
				1081	felem_inv(tmp, tmp);
				1082	felem_contract(out, tmp);
				1083	}
				1084
				1085	/*-
				1086	* Group operations
				1087	* ----------------
				1088	*
				1089	* Building on top of the field operations we have the operations on the
				1090	* elliptic curve group itself. Points on the curve are represented in Jacobian
				1091	* coordinates
				1092	*/
				1093
				1094	/*-
				1095	* point_double calculates 2*(x_in, y_in, z_in)
				1096	*
				1097	* The method is taken from:
				1098	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
				1099	*
				1100	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
				1101	* while x_out == y_in is not (maybe this works, but it's not tested).
				1102	*/
				1103	static void
				1104	point_double(felem x_out, felem y_out, felem z_out,
				1105	const felem x_in, const felem y_in, const felem z_in)
				1106	{
				1107	longfelem tmp, tmp2;
				1108	felem delta, gamma, beta, alpha, ftmp, ftmp2;
				1109	smallfelem small1, small2;
				1110
				1111	felem_assign(ftmp, x_in);
				1112	/* ftmp[i] < 2^106 */
				1113	felem_assign(ftmp2, x_in);
				1114	/* ftmp2[i] < 2^106 */
				1115
				1116	/* delta = z^2 */
				1117	felem_square(tmp, z_in);
				1118	felem_reduce(delta, tmp);
				1119	/* delta[i] < 2^101 */
				1120
				1121	/* gamma = y^2 */
				1122	felem_square(tmp, y_in);
				1123	felem_reduce(gamma, tmp);
				1124	/* gamma[i] < 2^101 */
				1125	felem_shrink(small1, gamma);
				1126
				1127	/* beta = xgamma /
				1128	felem_small_mul(tmp, small1, x_in);
				1129	felem_reduce(beta, tmp);
				1130	/* beta[i] < 2^101 */
				1131
				1132	/* alpha = 3(x-delta)(x+delta) */
				1133	felem_diff(ftmp, delta);
				1134	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
				1135	felem_sum(ftmp2, delta);
				1136	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
				1137	felem_scalar(ftmp2, 3);
				1138	/* ftmp2[i] < 3 * 2^107 < 2^109 */
				1139	felem_mul(tmp, ftmp, ftmp2);
				1140	felem_reduce(alpha, tmp);
				1141	/* alpha[i] < 2^101 */
				1142	felem_shrink(small2, alpha);
				1143
				1144	/* x' = alpha^2 - 8beta /
				1145	smallfelem_square(tmp, small2);
				1146	felem_reduce(x_out, tmp);
				1147	felem_assign(ftmp, beta);
				1148	felem_scalar(ftmp, 8);
				1149	/* ftmp[i] < 8 * 2^101 = 2^104 */
				1150	felem_diff(x_out, ftmp);
				1151	/* x_out[i] < 2^105 + 2^101 < 2^106 */
				1152
				1153	/* z' = (y + z)^2 - gamma - delta */
				1154	felem_sum(delta, gamma);
				1155	/* delta[i] < 2^101 + 2^101 = 2^102 */
				1156	felem_assign(ftmp, y_in);
				1157	felem_sum(ftmp, z_in);
				1158	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
				1159	felem_square(tmp, ftmp);
				1160	felem_reduce(z_out, tmp);
				1161	felem_diff(z_out, delta);
				1162	/* z_out[i] < 2^105 + 2^101 < 2^106 */
				1163
				1164	/* y' = alpha(4beta - x') - 8gamma^2 /
				1165	felem_scalar(beta, 4);
				1166	/* beta[i] < 4 * 2^101 = 2^103 */
				1167	felem_diff_zero107(beta, x_out);
				1168	/* beta[i] < 2^107 + 2^103 < 2^108 */
				1169	felem_small_mul(tmp, small2, beta);
				1170	/* tmp[i] < 7 * 2^64 < 2^67 */
				1171	smallfelem_square(tmp2, small1);
				1172	/* tmp2[i] < 7 * 2^64 */
				1173	longfelem_scalar(tmp2, 8);
				1174	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
				1175	longfelem_diff(tmp, tmp2);
				1176	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
				1177	felem_reduce_zero105(y_out, tmp);
				1178	/* y_out[i] < 2^106 */
				1179	}
				1180
				1181	/*
				1182	* point_double_small is the same as point_double, except that it operates on
				1183	* smallfelems
				1184	*/
				1185	static void
				1186	point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
				1187	const smallfelem x_in, const smallfelem y_in,
				1188	const smallfelem z_in)
				1189	{
				1190	felem felem_x_out, felem_y_out, felem_z_out;
				1191	felem felem_x_in, felem_y_in, felem_z_in;
				1192
				1193	smallfelem_expand(felem_x_in, x_in);
				1194	smallfelem_expand(felem_y_in, y_in);
				1195	smallfelem_expand(felem_z_in, z_in);
				1196	point_double(felem_x_out, felem_y_out, felem_z_out,
				1197	felem_x_in, felem_y_in, felem_z_in);
				1198	felem_shrink(x_out, felem_x_out);
				1199	felem_shrink(y_out, felem_y_out);
				1200	felem_shrink(z_out, felem_z_out);
				1201	}
				1202
				1203	/* copy_conditional copies in to out iff mask is all ones. */
				1204	static void copy_conditional(felem out, const felem in, limb mask)
				1205	{
				1206	unsigned i;
				1207	for (i = 0; i < NLIMBS; ++i) {
				1208	const limb tmp = mask & (in[i] ^ out[i]);
				1209	out[i] ^= tmp;
				1210	}
				1211	}
				1212
				1213	/* copy_small_conditional copies in to out iff mask is all ones. */
				1214	static void copy_small_conditional(felem out, const smallfelem in, limb mask)
				1215	{
				1216	unsigned i;
				1217	const u64 mask64 = mask;
				1218	for (i = 0; i < NLIMBS; ++i) {
				1219	out[i] = ((limb) (in[i] & mask64)) \| (out[i] & ~mask);
				1220	}
				1221	}
				1222
				1223	/*-
				1224	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
				1225	*
				1226	* The method is taken from:
				1227	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
				1228	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
				1229	*
				1230	* This function includes a branch for checking whether the two input points
				1231	* are equal, (while not equal to the point at infinity). This case never
				1232	* happens during single point multiplication, so there is no timing leak for
				1233	* ECDH or ECDSA signing.
				1234	*/
				1235	static void point_add(felem x3, felem y3, felem z3,
				1236	const felem x1, const felem y1, const felem z1,
				1237	const int mixed, const smallfelem x2,
				1238	const smallfelem y2, const smallfelem z2)
				1239	{
				1240	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
				1241	longfelem tmp, tmp2;
				1242	smallfelem small1, small2, small3, small4, small5;
				1243	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
				1244	limb points_equal;
				1245
				1246	felem_shrink(small3, z1);
				1247
				1248	z1_is_zero = smallfelem_is_zero(small3);
				1249	z2_is_zero = smallfelem_is_zero(z2);
				1250
				1251	/* ftmp = z1z1 = z1*2 /
				1252	smallfelem_square(tmp, small3);
				1253	felem_reduce(ftmp, tmp);
				1254	/* ftmp[i] < 2^101 */
				1255	felem_shrink(small1, ftmp);
				1256
				1257	if (!mixed) {
				1258	/* ftmp2 = z2z2 = z2*2 /
				1259	smallfelem_square(tmp, z2);
				1260	felem_reduce(ftmp2, tmp);
				1261	/* ftmp2[i] < 2^101 */
				1262	felem_shrink(small2, ftmp2);
				1263
				1264	felem_shrink(small5, x1);
				1265
				1266	/* u1 = ftmp3 = x1z2z2 /
				1267	smallfelem_mul(tmp, small5, small2);
				1268	felem_reduce(ftmp3, tmp);
				1269	/* ftmp3[i] < 2^101 */
				1270
				1271	/* ftmp5 = z1 + z2 */
				1272	felem_assign(ftmp5, z1);
				1273	felem_small_sum(ftmp5, z2);
				1274	/* ftmp5[i] < 2^107 */
				1275
				1276	/* ftmp5 = (z1 + z2)*2 - (z1z1 + z2z2) = 2z1z2 /
				1277	felem_square(tmp, ftmp5);
				1278	felem_reduce(ftmp5, tmp);
				1279	/* ftmp2 = z2z2 + z1z1 */
				1280	felem_sum(ftmp2, ftmp);
				1281	/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
				1282	felem_diff(ftmp5, ftmp2);
				1283	/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
				1284
				1285	/* ftmp2 = z2 * z2z2 */
				1286	smallfelem_mul(tmp, small2, z2);
				1287	felem_reduce(ftmp2, tmp);
				1288
				1289	/* s1 = ftmp2 = y1 * z2*3 /
				1290	felem_mul(tmp, y1, ftmp2);
				1291	felem_reduce(ftmp6, tmp);
				1292	/* ftmp6[i] < 2^101 */
				1293	} else {
				1294	/*
				1295	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
				1296	*/
				1297
				1298	/* u1 = ftmp3 = x1z2z2 /
				1299	felem_assign(ftmp3, x1);
				1300	/* ftmp3[i] < 2^106 */
				1301
				1302	/* ftmp5 = 2z1z2 */
				1303	felem_assign(ftmp5, z1);
				1304	felem_scalar(ftmp5, 2);
				1305	/* ftmp5[i] < 22^106 = 2^107 /
				1306
				1307	/* s1 = ftmp2 = y1 * z2*3 /
				1308	felem_assign(ftmp6, y1);
				1309	/* ftmp6[i] < 2^106 */
				1310	}
				1311
				1312	/* u2 = x2z1z1 /
				1313	smallfelem_mul(tmp, x2, small1);
				1314	felem_reduce(ftmp4, tmp);
				1315
				1316	/* h = ftmp4 = u2 - u1 */
				1317	felem_diff_zero107(ftmp4, ftmp3);
				1318	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
				1319	felem_shrink(small4, ftmp4);
				1320
				1321	x_equal = smallfelem_is_zero(small4);
				1322
				1323	/* z_out = ftmp5 * h */
				1324	felem_small_mul(tmp, small4, ftmp5);
				1325	felem_reduce(z_out, tmp);
				1326	/* z_out[i] < 2^101 */
				1327
				1328	/* ftmp = z1 * z1z1 */
				1329	smallfelem_mul(tmp, small1, small3);
				1330	felem_reduce(ftmp, tmp);
				1331
				1332	/* s2 = tmp = y2 * z1*3 /
				1333	felem_small_mul(tmp, y2, ftmp);
				1334	felem_reduce(ftmp5, tmp);
				1335
				1336	/* r = ftmp5 = (s2 - s1)2 /
				1337	felem_diff_zero107(ftmp5, ftmp6);
				1338	/* ftmp5[i] < 2^107 + 2^107 = 2^108 */
				1339	felem_scalar(ftmp5, 2);
				1340	/* ftmp5[i] < 2^109 */
				1341	felem_shrink(small1, ftmp5);
				1342	y_equal = smallfelem_is_zero(small1);
				1343
				1344	/*
				1345	* The formulae are incorrect if the points are equal, in affine coordinates
				1346	* (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
				1347	* happens.
				1348	*
				1349	* We use bitwise operations to avoid potential side-channels introduced by
				1350	* the short-circuiting behaviour of boolean operators.
				1351	*
				1352	* The special case of either point being the point at infinity (z1 and/or
				1353	* z2 are zero), is handled separately later on in this function, so we
				1354	* avoid jumping to point_double here in those special cases.
				1355	*/
				1356	points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
				1357
				1358	if (points_equal) {
				1359	/*
				1360	* This is obviously not constant-time but, as mentioned before, this
				1361	* case never happens during single point multiplication, so there is no
				1362	* timing leak for ECDH or ECDSA signing.
				1363	*/
				1364	point_double(x3, y3, z3, x1, y1, z1);
				1365	return;
				1366	}
				1367
				1368	/* I = ftmp = (2h)*2 /
				1369	felem_assign(ftmp, ftmp4);
				1370	felem_scalar(ftmp, 2);
				1371	/* ftmp[i] < 22^108 = 2^109 /
				1372	felem_square(tmp, ftmp);
				1373	felem_reduce(ftmp, tmp);
				1374
				1375	/* J = ftmp2 = h * I */
				1376	felem_mul(tmp, ftmp4, ftmp);
				1377	felem_reduce(ftmp2, tmp);
				1378
				1379	/* V = ftmp4 = U1 * I */
				1380	felem_mul(tmp, ftmp3, ftmp);
				1381	felem_reduce(ftmp4, tmp);
				1382
				1383	/* x_out = r*2 - J - 2V /
				1384	smallfelem_square(tmp, small1);
				1385	felem_reduce(x_out, tmp);
				1386	felem_assign(ftmp3, ftmp4);
				1387	felem_scalar(ftmp4, 2);
				1388	felem_sum(ftmp4, ftmp2);
				1389	/* ftmp4[i] < 22^101 + 2^101 < 2^103 /
				1390	felem_diff(x_out, ftmp4);
				1391	/* x_out[i] < 2^105 + 2^101 */
				1392
				1393	/* y_out = r(V-x_out) - 2 * s1 * J */
				1394	felem_diff_zero107(ftmp3, x_out);
				1395	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
				1396	felem_small_mul(tmp, small1, ftmp3);
				1397	felem_mul(tmp2, ftmp6, ftmp2);
				1398	longfelem_scalar(tmp2, 2);
				1399	/* tmp2[i] < 22^67 = 2^68 /
				1400	longfelem_diff(tmp, tmp2);
				1401	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
				1402	felem_reduce_zero105(y_out, tmp);
				1403	/* y_out[i] < 2^106 */
				1404
				1405	copy_small_conditional(x_out, x2, z1_is_zero);
				1406	copy_conditional(x_out, x1, z2_is_zero);
				1407	copy_small_conditional(y_out, y2, z1_is_zero);
				1408	copy_conditional(y_out, y1, z2_is_zero);
				1409	copy_small_conditional(z_out, z2, z1_is_zero);
				1410	copy_conditional(z_out, z1, z2_is_zero);
				1411	felem_assign(x3, x_out);
				1412	felem_assign(y3, y_out);
				1413	felem_assign(z3, z_out);
				1414	}
				1415
				1416	/*
				1417	* point_add_small is the same as point_add, except that it operates on
				1418	* smallfelems
				1419	*/
				1420	static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
				1421	smallfelem x1, smallfelem y1, smallfelem z1,
				1422	smallfelem x2, smallfelem y2, smallfelem z2)
				1423	{
				1424	felem felem_x3, felem_y3, felem_z3;
				1425	felem felem_x1, felem_y1, felem_z1;
				1426	smallfelem_expand(felem_x1, x1);
				1427	smallfelem_expand(felem_y1, y1);
				1428	smallfelem_expand(felem_z1, z1);
				1429	point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
				1430	x2, y2, z2);
				1431	felem_shrink(x3, felem_x3);
				1432	felem_shrink(y3, felem_y3);
				1433	felem_shrink(z3, felem_z3);
				1434	}
				1435
/*-
 * Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 2 * 16 elements, starting with the following:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | 0G
 *     1 | 0 0 0 1 | 1G
 *     2 | 0 0 1 0 | 2^64G
 *     3 | 0 0 1 1 | (2^64 + 1)G
 *     4 | 0 1 0 0 | 2^128G
 *     5 | 0 1 0 1 | (2^128 + 1)G
 *     6 | 0 1 1 0 | (2^128 + 2^64)G
 *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
 *     8 | 1 0 0 0 | 2^192G
 *     9 | 1 0 0 1 | (2^192 + 1)G
 *    10 | 1 0 1 0 | (2^192 + 2^64)G
 *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
 *    12 | 1 1 0 0 | (2^192 + 2^128)G
 *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
 *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
 *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
 * followed by a copy of this with each element multiplied by 2^32.
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point,
 * and then another four locations using the second 16 elements.
 *
 * Tables for other points have table[i] = iG for i in 0 .. 16. */
				1471
				1472	/* gmul is the table of precomputed base points */
				1473	static const smallfelem gmul[2][16][3] = {
				1474	{{{0, 0, 0, 0},
				1475	{0, 0, 0, 0},
				1476	{0, 0, 0, 0}},
				1477	{{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
				1478	0x6b17d1f2e12c4247},
				1479	{0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
				1480	0x4fe342e2fe1a7f9b},
				1481	{1, 0, 0, 0}},
				1482	{{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
				1483	0x0fa822bc2811aaa5},
				1484	{0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
				1485	0xbff44ae8f5dba80d},
				1486	{1, 0, 0, 0}},
				1487	{{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
				1488	0x300a4bbc89d6726f},
				1489	{0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
				1490	0x72aac7e0d09b4644},
				1491	{1, 0, 0, 0}},
				1492	{{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
				1493	0x447d739beedb5e67},
				1494	{0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
				1495	0x2d4825ab834131ee},
				1496	{1, 0, 0, 0}},
				1497	{{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
				1498	0xef9519328a9c72ff},
				1499	{0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
				1500	0x611e9fc37dbb2c9b},
				1501	{1, 0, 0, 0}},
				1502	{{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
				1503	0x550663797b51f5d8},
				1504	{0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
				1505	0x157164848aecb851},
				1506	{1, 0, 0, 0}},
				1507	{{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
				1508	0xeb5d7745b21141ea},
				1509	{0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
				1510	0xeafd72ebdbecc17b},
				1511	{1, 0, 0, 0}},
				1512	{{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
				1513	0xa6d39677a7849276},
				1514	{0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
				1515	0x674f84749b0b8816},
				1516	{1, 0, 0, 0}},
				1517	{{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
				1518	0x4e769e7672c9ddad},
				1519	{0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
				1520	0x42b99082de830663},
				1521	{1, 0, 0, 0}},
				1522	{{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
				1523	0x78878ef61c6ce04d},
				1524	{0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
				1525	0xb6cb3f5d7b72c321},
				1526	{1, 0, 0, 0}},
				1527	{{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
				1528	0x0c88bc4d716b1287},
				1529	{0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
				1530	0xdd5ddea3f3901dc6},
				1531	{1, 0, 0, 0}},
				1532	{{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
				1533	0x68f344af6b317466},
				1534	{0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
				1535	0x31b9c405f8540a20},
				1536	{1, 0, 0, 0}},
				1537	{{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
				1538	0x4052bf4b6f461db9},
				1539	{0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
				1540	0xfecf4d5190b0fc61},
				1541	{1, 0, 0, 0}},
				1542	{{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
				1543	0x1eddbae2c802e41a},
				1544	{0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
				1545	0x43104d86560ebcfc},
				1546	{1, 0, 0, 0}},
				1547	{{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
				1548	0xb48e26b484f7a21c},
				1549	{0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
				1550	0xfac015404d4d3dab},
				1551	{1, 0, 0, 0}}},
				1552	{{{0, 0, 0, 0},
				1553	{0, 0, 0, 0},
				1554	{0, 0, 0, 0}},
				1555	{{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
				1556	0x7fe36b40af22af89},
				1557	{0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
				1558	0xe697d45825b63624},
				1559	{1, 0, 0, 0}},
				1560	{{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
				1561	0x4a5b506612a677a6},
				1562	{0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
				1563	0xeb13461ceac089f1},
				1564	{1, 0, 0, 0}},
				1565	{{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
				1566	0x0781b8291c6a220a},
				1567	{0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
				1568	0x690cde8df0151593},
				1569	{1, 0, 0, 0}},
				1570	{{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
				1571	0x8a535f566ec73617},
				1572	{0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
				1573	0x0455c08468b08bd7},
				1574	{1, 0, 0, 0}},
				1575	{{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
				1576	0x06bada7ab77f8276},
				1577	{0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
				1578	0x5b476dfd0e6cb18a},
				1579	{1, 0, 0, 0}},
				1580	{{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
				1581	0x3e29864e8a2ec908},
				1582	{0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
				1583	0x239b90ea3dc31e7e},
				1584	{1, 0, 0, 0}},
				1585	{{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
				1586	0x820f4dd949f72ff7},
				1587	{0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
				1588	0x140406ec783a05ec},
				1589	{1, 0, 0, 0}},
				1590	{{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
				1591	0x68f6b8542783dfee},
				1592	{0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
				1593	0xcbe1feba92e40ce6},
				1594	{1, 0, 0, 0}},
				1595	{{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
				1596	0xd0b2f94d2f420109},
				1597	{0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
				1598	0x971459828b0719e5},
				1599	{1, 0, 0, 0}},
				1600	{{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
				1601	0x961610004a866aba},
				1602	{0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
				1603	0x7acb9fadcee75e44},
				1604	{1, 0, 0, 0}},
				1605	{{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
				1606	0x24eb9acca333bf5b},
				1607	{0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
				1608	0x69f891c5acd079cc},
				1609	{1, 0, 0, 0}},
				1610	{{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
				1611	0xe51f547c5972a107},
				1612	{0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
				1613	0x1c309a2b25bb1387},
				1614	{1, 0, 0, 0}},
				1615	{{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
				1616	0x20b87b8aa2c4e503},
				1617	{0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
				1618	0xf5c6fa49919776be},
				1619	{1, 0, 0, 0}},
				1620	{{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
				1621	0x1ed7d1b9332010b9},
				1622	{0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
				1623	0x3a2b03f03217257a},
				1624	{1, 0, 0, 0}},
				1625	{{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
				1626	0x15fee545c78dd9f6},
				1627	{0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
				1628	0x4ab5b6b2b8753f81},
				1629	{1, 0, 0, 0}}}
				1630	};
				1631
				1632	/*
				1633	* select_point selects the \|idx\|th point from a precomputation table and
				1634	* copies it to out.
				1635	*/
				1636	static void select_point(const u64 idx, unsigned int size,
				1637	const smallfelem pre_comp[16][3], smallfelem out[3])
				1638	{
				1639	unsigned i, j;
				1640	u64 *outlimbs = &out[0][0];
				1641
				1642	memset(out, 0, sizeof(out) 3);
				1643
				1644	for (i = 0; i < size; i++) {
				1645	const u64 inlimbs = (u64 )&pre_comp[i][0][0];
				1646	u64 mask = i ^ idx;
				1647	mask \|= mask >> 4;
				1648	mask \|= mask >> 2;
				1649	mask \|= mask >> 1;
				1650	mask &= 1;
				1651	mask--;
				1652	for (j = 0; j < NLIMBS * 3; j++)
				1653	outlimbs[j] \|= inlimbs[j] & mask;
				1654	}
				1655	}
				1656
				1657	/* get_bit returns the \|i\|th bit in \|in\| */
				1658	static char get_bit(const felem_bytearray in, int i)
				1659	{
				1660	if ((i < 0) \|\| (i >= 256))
				1661	return 0;
				1662	return (in[i >> 3] >> (i & 7)) & 1;
				1663	}
				1664
				1665	/*
				1666	* Interleaved point multiplication using precomputed point multiples: The
				1667	* small point multiples 0P, 1P, ..., 17*P are in pre_comp[], the scalars
				1668	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
				1669	* generator, using certain (large) precomputed multiples in g_pre_comp.
				1670	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
				1671	*/
				1672	static void batch_mul(felem x_out, felem y_out, felem z_out,
				1673	const felem_bytearray scalars[],
				1674	const unsigned num_points, const u8 *g_scalar,
				1675	const int mixed, const smallfelem pre_comp[][17][3],
				1676	const smallfelem g_pre_comp[2][16][3])
				1677	{
				1678	int i, skip;
				1679	unsigned num, gen_mul = (g_scalar != NULL);
				1680	felem nq[3], ftmp;
				1681	smallfelem tmp[3];
				1682	u64 bits;
				1683	u8 sign, digit;
				1684
				1685	/* set nq to the point at infinity */
				1686	memset(nq, 0, sizeof(nq));
				1687
				1688	/*
				1689	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
				1690	* of the generator (two in each of the last 32 rounds) and additions of
				1691	* other points multiples (every 5th round).
				1692	*/
				1693	skip = 1; /* save two point operations in the first
				1694	* round */
				1695	for (i = (num_points ? 255 : 31); i >= 0; --i) {
				1696	/* double */
				1697	if (!skip)
				1698	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
				1699
				1700	/* add multiples of the generator */
				1701	if (gen_mul && (i <= 31)) {
				1702	/* first, look 32 bits upwards */
				1703	bits = get_bit(g_scalar, i + 224) << 3;
				1704	bits \|= get_bit(g_scalar, i + 160) << 2;
				1705	bits \|= get_bit(g_scalar, i + 96) << 1;
				1706	bits \|= get_bit(g_scalar, i + 32);
				1707	/* select the point to add, in constant time */
				1708	select_point(bits, 16, g_pre_comp[1], tmp);
				1709
				1710	if (!skip) {
				1711	/* Arg 1 below is for "mixed" */
				1712	point_add(nq[0], nq[1], nq[2],
				1713	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
				1714	} else {
				1715	smallfelem_expand(nq[0], tmp[0]);
				1716	smallfelem_expand(nq[1], tmp[1]);
				1717	smallfelem_expand(nq[2], tmp[2]);
				1718	skip = 0;
				1719	}
				1720
				1721	/* second, look at the current position */
				1722	bits = get_bit(g_scalar, i + 192) << 3;
				1723	bits \|= get_bit(g_scalar, i + 128) << 2;
				1724	bits \|= get_bit(g_scalar, i + 64) << 1;
				1725	bits \|= get_bit(g_scalar, i);
				1726	/* select the point to add, in constant time */
				1727	select_point(bits, 16, g_pre_comp[0], tmp);
				1728	/* Arg 1 below is for "mixed" */
				1729	point_add(nq[0], nq[1], nq[2],
				1730	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
				1731	}
				1732
				1733	/* do other additions every 5 doublings */
				1734	if (num_points && (i % 5 == 0)) {
				1735	/* loop over all scalars */
				1736	for (num = 0; num < num_points; ++num) {
				1737	bits = get_bit(scalars[num], i + 4) << 5;
				1738	bits \|= get_bit(scalars[num], i + 3) << 4;
				1739	bits \|= get_bit(scalars[num], i + 2) << 3;
				1740	bits \|= get_bit(scalars[num], i + 1) << 2;
				1741	bits \|= get_bit(scalars[num], i) << 1;
				1742	bits \|= get_bit(scalars[num], i - 1);
				1743	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
				1744
				1745	/*
				1746	* select the point to add or subtract, in constant time
				1747	*/
				1748	select_point(digit, 17, pre_comp[num], tmp);
				1749	smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
				1750	* point */
				1751	copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
				1752	felem_contract(tmp[1], ftmp);
				1753
				1754	if (!skip) {
				1755	point_add(nq[0], nq[1], nq[2],
				1756	nq[0], nq[1], nq[2],
				1757	mixed, tmp[0], tmp[1], tmp[2]);
				1758	} else {
				1759	smallfelem_expand(nq[0], tmp[0]);
				1760	smallfelem_expand(nq[1], tmp[1]);
				1761	smallfelem_expand(nq[2], tmp[2]);
				1762	skip = 0;
				1763	}
				1764	}
				1765	}
				1766	}
				1767	felem_assign(x_out, nq[0]);
				1768	felem_assign(y_out, nq[1]);
				1769	felem_assign(z_out, nq[2]);
				1770	}
				1771
				1772	/* Precomputation for the group generator. */
				1773	struct nistp256_pre_comp_st {
				1774	smallfelem g_pre_comp[2][16][3];
				1775	CRYPTO_REF_COUNT references;
				1776	CRYPTO_RWLOCK *lock;
				1777	};
				1778
				1779	const EC_METHOD *EC_GFp_nistp256_method(void)
				1780	{
				1781	static const EC_METHOD ret = {
				1782	EC_FLAGS_DEFAULT_OCT,
				1783	NID_X9_62_prime_field,
				1784	ec_GFp_nistp256_group_init,
				1785	ec_GFp_simple_group_finish,
				1786	ec_GFp_simple_group_clear_finish,
				1787	ec_GFp_nist_group_copy,
				1788	ec_GFp_nistp256_group_set_curve,
				1789	ec_GFp_simple_group_get_curve,
				1790	ec_GFp_simple_group_get_degree,
				1791	ec_group_simple_order_bits,
				1792	ec_GFp_simple_group_check_discriminant,
				1793	ec_GFp_simple_point_init,
				1794	ec_GFp_simple_point_finish,
				1795	ec_GFp_simple_point_clear_finish,
				1796	ec_GFp_simple_point_copy,
				1797	ec_GFp_simple_point_set_to_infinity,
				1798	ec_GFp_simple_set_Jprojective_coordinates_GFp,
				1799	ec_GFp_simple_get_Jprojective_coordinates_GFp,
				1800	ec_GFp_simple_point_set_affine_coordinates,
				1801	ec_GFp_nistp256_point_get_affine_coordinates,
				1802	0 /* point_set_compressed_coordinates */ ,
				1803	0 /* point2oct */ ,
				1804	0 /* oct2point */ ,
				1805	ec_GFp_simple_add,
				1806	ec_GFp_simple_dbl,
				1807	ec_GFp_simple_invert,
				1808	ec_GFp_simple_is_at_infinity,
				1809	ec_GFp_simple_is_on_curve,
				1810	ec_GFp_simple_cmp,
				1811	ec_GFp_simple_make_affine,
				1812	ec_GFp_simple_points_make_affine,
				1813	ec_GFp_nistp256_points_mul,
				1814	ec_GFp_nistp256_precompute_mult,
				1815	ec_GFp_nistp256_have_precompute_mult,
				1816	ec_GFp_nist_field_mul,
				1817	ec_GFp_nist_field_sqr,
				1818	0 /* field_div */ ,
				1819	ec_GFp_simple_field_inv,
				1820	0 /* field_encode */ ,
				1821	0 /* field_decode */ ,
				1822	0, /* field_set_to_one */
				1823	ec_key_simple_priv2oct,
				1824	ec_key_simple_oct2priv,
				1825	0, /* set private */
				1826	ec_key_simple_generate_key,
				1827	ec_key_simple_check_key,
				1828	ec_key_simple_generate_public_key,
				1829	0, /* keycopy */
				1830	0, /* keyfinish */
				1831	ecdh_simple_compute_key,
				1832	0, /* field_inverse_mod_ord */
				1833	0, /* blind_coordinates */
				1834	0, /* ladder_pre */
				1835	0, /* ladder_step */
				1836	0 /* ladder_post */
				1837	};
				1838
				1839	return &ret;
				1840	}
				1841
				1842	/******************************************************************************/
				1843	/*
				1844	* FUNCTIONS TO MANAGE PRECOMPUTATION
				1845	*/
				1846
				1847	static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
				1848	{
				1849	NISTP256_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
				1850
				1851	if (ret == NULL) {
				1852	ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
				1853	return ret;
				1854	}
				1855
				1856	ret->references = 1;
				1857
				1858	ret->lock = CRYPTO_THREAD_lock_new();
				1859	if (ret->lock == NULL) {
				1860	ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
				1861	OPENSSL_free(ret);
				1862	return NULL;
				1863	}
				1864	return ret;
				1865	}
				1866
				1867	NISTP256_PRE_COMP EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP p)
				1868	{
				1869	int i;
				1870	if (p != NULL)
				1871	CRYPTO_UP_REF(&p->references, &i, p->lock);
				1872	return p;
				1873	}
				1874
				1875	void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
				1876	{
				1877	int i;
				1878
				1879	if (pre == NULL)
				1880	return;
				1881
				1882	CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
				1883	REF_PRINT_COUNT("EC_nistp256", x);
				1884	if (i > 0)
				1885	return;
				1886	REF_ASSERT_ISNT(i < 0);
				1887
				1888	CRYPTO_THREAD_lock_free(pre->lock);
				1889	OPENSSL_free(pre);
				1890	}
				1891
				1892	/******************************************************************************/
				1893	/*
				1894	* OPENSSL EC_METHOD FUNCTIONS
				1895	*/
				1896
				1897	int ec_GFp_nistp256_group_init(EC_GROUP *group)
				1898	{
				1899	int ret;
				1900	ret = ec_GFp_simple_group_init(group);
				1901	group->a_is_minus3 = 1;
				1902	return ret;
				1903	}
				1904
				1905	int ec_GFp_nistp256_group_set_curve(EC_GROUP group, const BIGNUM p,
				1906	const BIGNUM a, const BIGNUM b,
				1907	BN_CTX *ctx)
				1908	{
				1909	int ret = 0;
				1910	BN_CTX *new_ctx = NULL;
				1911	BIGNUM curve_p, curve_a, *curve_b;
				1912
				1913	if (ctx == NULL)
				1914	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
				1915	return 0;
				1916	BN_CTX_start(ctx);
				1917	curve_p = BN_CTX_get(ctx);
				1918	curve_a = BN_CTX_get(ctx);
				1919	curve_b = BN_CTX_get(ctx);
				1920	if (curve_b == NULL)
				1921	goto err;
				1922	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
				1923	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
				1924	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
				1925	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
				1926	ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
				1927	EC_R_WRONG_CURVE_PARAMETERS);
				1928	goto err;
				1929	}
				1930	group->field_mod_func = BN_nist_mod_256;
				1931	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
				1932	err:
				1933	BN_CTX_end(ctx);
				1934	BN_CTX_free(new_ctx);
				1935	return ret;
				1936	}
				1937
				1938	/*
				1939	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
				1940	* (X/Z^2, Y/Z^3)
				1941	*/
				1942	int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
				1943	const EC_POINT *point,
				1944	BIGNUM x, BIGNUM y,
				1945	BN_CTX *ctx)
				1946	{
				1947	felem z1, z2, x_in, y_in;
				1948	smallfelem x_out, y_out;
				1949	longfelem tmp;
				1950
				1951	if (EC_POINT_is_at_infinity(group, point)) {
				1952	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				1953	EC_R_POINT_AT_INFINITY);
				1954	return 0;
				1955	}
				1956	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
				1957	(!BN_to_felem(z1, point->Z)))
				1958	return 0;
				1959	felem_inv(z2, z1);
				1960	felem_square(tmp, z2);
				1961	felem_reduce(z1, tmp);
				1962	felem_mul(tmp, x_in, z1);
				1963	felem_reduce(x_in, tmp);
				1964	felem_contract(x_out, x_in);
				1965	if (x != NULL) {
				1966	if (!smallfelem_to_BN(x, x_out)) {
				1967	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				1968	ERR_R_BN_LIB);
				1969	return 0;
				1970	}
				1971	}
				1972	felem_mul(tmp, z1, z2);
				1973	felem_reduce(z1, tmp);
				1974	felem_mul(tmp, y_in, z1);
				1975	felem_reduce(y_in, tmp);
				1976	felem_contract(y_out, y_in);
				1977	if (y != NULL) {
				1978	if (!smallfelem_to_BN(y, y_out)) {
				1979	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				1980	ERR_R_BN_LIB);
				1981	return 0;
				1982	}
				1983	}
				1984	return 1;
				1985	}
				1986
				1987	/* points below is of size \|num\|, and tmp_smallfelems is of size \|num+1\| */
				1988	static void make_points_affine(size_t num, smallfelem points[][3],
				1989	smallfelem tmp_smallfelems[])
				1990	{
				1991	/*
				1992	* Runs in constant time, unless an input is the point at infinity (which
				1993	* normally shouldn't happen).
				1994	*/
				1995	ec_GFp_nistp_points_make_affine_internal(num,
				1996	points,
				1997	sizeof(smallfelem),
				1998	tmp_smallfelems,
				1999	(void ()(void ))smallfelem_one,
				2000	smallfelem_is_zero_int,
				2001	(void ()(void , const void *))
				2002	smallfelem_assign,
				2003	(void ()(void , const void *))
				2004	smallfelem_square_contract,
				2005	(void (*)
				2006	(void , const void ,
				2007	const void *))
				2008	smallfelem_mul_contract,
				2009	(void ()(void , const void *))
				2010	smallfelem_inv_contract,
				2011	/* nothing to contract */
				2012	(void ()(void , const void *))
				2013	smallfelem_assign);
				2014	}
				2015
				2016	/*
				2017	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
				2018	* values Result is stored in r (r can equal one of the inputs).
				2019	*/
				2020	int ec_GFp_nistp256_points_mul(const EC_GROUP group, EC_POINT r,
				2021	const BIGNUM *scalar, size_t num,
				2022	const EC_POINT *points[],
				2023	const BIGNUM scalars[], BN_CTX ctx)
				2024	{
				2025	int ret = 0;
				2026	int j;
				2027	int mixed = 0;
				2028	BIGNUM x, y, z, tmp_scalar;
				2029	felem_bytearray g_secret;
				2030	felem_bytearray *secrets = NULL;
				2031	smallfelem (*pre_comp)[17][3] = NULL;
				2032	smallfelem *tmp_smallfelems = NULL;
				2033	unsigned i;
				2034	int num_bytes;
				2035	int have_pre_comp = 0;
				2036	size_t num_points = num;
				2037	smallfelem x_in, y_in, z_in;
				2038	felem x_out, y_out, z_out;
				2039	NISTP256_PRE_COMP *pre = NULL;
				2040	const smallfelem(*g_pre_comp)[16][3] = NULL;
				2041	EC_POINT *generator = NULL;
				2042	const EC_POINT *p = NULL;
				2043	const BIGNUM *p_scalar = NULL;
				2044
				2045	BN_CTX_start(ctx);
				2046	x = BN_CTX_get(ctx);
				2047	y = BN_CTX_get(ctx);
				2048	z = BN_CTX_get(ctx);
				2049	tmp_scalar = BN_CTX_get(ctx);
				2050	if (tmp_scalar == NULL)
				2051	goto err;
				2052
				2053	if (scalar != NULL) {
				2054	pre = group->pre_comp.nistp256;
				2055	if (pre)
				2056	/* we have precomputation, try to use it */
				2057	g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
				2058	else
				2059	/* try to use the standard precomputation */
				2060	g_pre_comp = &gmul[0];
				2061	generator = EC_POINT_new(group);
				2062	if (generator == NULL)
				2063	goto err;
				2064	/* get the generator from precomputation */
				2065	if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) \|\|
				2066	!smallfelem_to_BN(y, g_pre_comp[0][1][1]) \|\|
				2067	!smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
				2068	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				2069	goto err;
				2070	}
				2071	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
				2072	generator, x, y, z,
				2073	ctx))
				2074	goto err;
				2075	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
				2076	/* precomputation matches generator */
				2077	have_pre_comp = 1;
				2078	else
				2079	/*
				2080	* we don't have valid precomputation: treat the generator as a
				2081	* random point
				2082	*/
				2083	num_points++;
				2084	}
				2085	if (num_points > 0) {
				2086	if (num_points >= 3) {
				2087	/*
				2088	* unless we precompute multiples for just one or two points,
				2089	* converting those into affine form is time well spent
				2090	*/
				2091	mixed = 1;
				2092	}
				2093	secrets = OPENSSL_malloc(sizeof(secrets) num_points);
				2094	pre_comp = OPENSSL_malloc(sizeof(pre_comp) num_points);
				2095	if (mixed)
				2096	tmp_smallfelems =
				2097	OPENSSL_malloc(sizeof(tmp_smallfelems) (num_points * 17 + 1));
				2098	if ((secrets == NULL) \|\| (pre_comp == NULL)
				2099	\|\| (mixed && (tmp_smallfelems == NULL))) {
				2100	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
				2101	goto err;
				2102	}
				2103
				2104	/*
				2105	* we treat NULL scalars as 0, and NULL points as points at infinity,
				2106	* i.e., they contribute nothing to the linear combination
				2107	*/
				2108	memset(secrets, 0, sizeof(secrets) num_points);
				2109	memset(pre_comp, 0, sizeof(pre_comp) num_points);
				2110	for (i = 0; i < num_points; ++i) {
				2111	if (i == num) {
				2112	/*
				2113	* we didn't have a valid precomputation, so we pick the
				2114	* generator
				2115	*/
				2116	p = EC_GROUP_get0_generator(group);
				2117	p_scalar = scalar;
				2118	} else {
				2119	/* the i^th point */
				2120	p = points[i];
				2121	p_scalar = scalars[i];
				2122	}
				2123	if ((p_scalar != NULL) && (p != NULL)) {
				2124	/* reduce scalar to 0 <= scalar < 2^256 */
				2125	if ((BN_num_bits(p_scalar) > 256)
				2126	\|\| (BN_is_negative(p_scalar))) {
				2127	/*
				2128	* this is an unusual input, and we don't guarantee
				2129	* constant-timeness
				2130	*/
				2131	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
				2132	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				2133	goto err;
				2134	}
				2135	num_bytes = BN_bn2lebinpad(tmp_scalar,
				2136	secrets[i], sizeof(secrets[i]));
				2137	} else {
				2138	num_bytes = BN_bn2lebinpad(p_scalar,
				2139	secrets[i], sizeof(secrets[i]));
				2140	}
				2141	if (num_bytes < 0) {
				2142	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				2143	goto err;
				2144	}
				2145	/* precompute multiples */
				2146	if ((!BN_to_felem(x_out, p->X)) \|\|
				2147	(!BN_to_felem(y_out, p->Y)) \|\|
				2148	(!BN_to_felem(z_out, p->Z)))
				2149	goto err;
				2150	felem_shrink(pre_comp[i][1][0], x_out);
				2151	felem_shrink(pre_comp[i][1][1], y_out);
				2152	felem_shrink(pre_comp[i][1][2], z_out);
				2153	for (j = 2; j <= 16; ++j) {
				2154	if (j & 1) {
				2155	point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
				2156	pre_comp[i][j][2], pre_comp[i][1][0],
				2157	pre_comp[i][1][1], pre_comp[i][1][2],
				2158	pre_comp[i][j - 1][0],
				2159	pre_comp[i][j - 1][1],
				2160	pre_comp[i][j - 1][2]);
				2161	} else {
				2162	point_double_small(pre_comp[i][j][0],
				2163	pre_comp[i][j][1],
				2164	pre_comp[i][j][2],
				2165	pre_comp[i][j / 2][0],
				2166	pre_comp[i][j / 2][1],
				2167	pre_comp[i][j / 2][2]);
				2168	}
				2169	}
				2170	}
				2171	}
				2172	if (mixed)
				2173	make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
				2174	}
				2175
				2176	/* the scalar for the generator */
				2177	if ((scalar != NULL) && (have_pre_comp)) {
				2178	memset(g_secret, 0, sizeof(g_secret));
				2179	/* reduce scalar to 0 <= scalar < 2^256 */
				2180	if ((BN_num_bits(scalar) > 256) \|\| (BN_is_negative(scalar))) {
				2181	/*
				2182	* this is an unusual input, and we don't guarantee
				2183	* constant-timeness
				2184	*/
				2185	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
				2186	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				2187	goto err;
				2188	}
				2189	num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
				2190	} else {
				2191	num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
				2192	}
				2193	/* do the multiplication with generator precomputation */
				2194	batch_mul(x_out, y_out, z_out,
				2195	(const felem_bytearray(*))secrets, num_points,
				2196	g_secret,
				2197	mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
				2198	} else {
				2199	/* do the multiplication without generator precomputation */
				2200	batch_mul(x_out, y_out, z_out,
				2201	(const felem_bytearray(*))secrets, num_points,
				2202	NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
				2203	}
				2204	/* reduce the output to its unique minimal representation */
				2205	felem_contract(x_in, x_out);
				2206	felem_contract(y_in, y_out);
				2207	felem_contract(z_in, z_out);
				2208	if ((!smallfelem_to_BN(x, x_in)) \|\| (!smallfelem_to_BN(y, y_in)) \|\|
				2209	(!smallfelem_to_BN(z, z_in))) {
				2210	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				2211	goto err;
				2212	}
				2213	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
				2214
				2215	err:
				2216	BN_CTX_end(ctx);
				2217	EC_POINT_free(generator);
				2218	OPENSSL_free(secrets);
				2219	OPENSSL_free(pre_comp);
				2220	OPENSSL_free(tmp_smallfelems);
				2221	return ret;
				2222	}
				2223
				2224	int ec_GFp_nistp256_precompute_mult(EC_GROUP group, BN_CTX ctx)
				2225	{
				2226	int ret = 0;
				2227	NISTP256_PRE_COMP *pre = NULL;
				2228	int i, j;
				2229	BN_CTX *new_ctx = NULL;
				2230	BIGNUM x, y;
				2231	EC_POINT *generator = NULL;
				2232	smallfelem tmp_smallfelems[32];
				2233	felem x_tmp, y_tmp, z_tmp;
				2234
				2235	/* throw away old precomputation */
				2236	EC_pre_comp_free(group);
				2237	if (ctx == NULL)
				2238	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
				2239	return 0;
				2240	BN_CTX_start(ctx);
				2241	x = BN_CTX_get(ctx);
				2242	y = BN_CTX_get(ctx);
				2243	if (y == NULL)
				2244	goto err;
				2245	/* get the generator */
				2246	if (group->generator == NULL)
				2247	goto err;
				2248	generator = EC_POINT_new(group);
				2249	if (generator == NULL)
				2250	goto err;
				2251	BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
				2252	BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
				2253	if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
				2254	goto err;
				2255	if ((pre = nistp256_pre_comp_new()) == NULL)
				2256	goto err;
				2257	/*
				2258	* if the generator is the standard one, use built-in precomputation
				2259	*/
				2260	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
				2261	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
				2262	goto done;
				2263	}
				2264	if ((!BN_to_felem(x_tmp, group->generator->X)) \|\|
				2265	(!BN_to_felem(y_tmp, group->generator->Y)) \|\|
				2266	(!BN_to_felem(z_tmp, group->generator->Z)))
				2267	goto err;
				2268	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
				2269	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
				2270	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
				2271	/*
				2272	* compute 2^64G, 2^128G, 2^192G for the first table, 2^32G, 2^96*G,
				2273	* 2^160G, 2^224G for the second one
				2274	*/
				2275	for (i = 1; i <= 8; i <<= 1) {
				2276	point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
				2277	pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
				2278	pre->g_pre_comp[0][i][1],
				2279	pre->g_pre_comp[0][i][2]);
				2280	for (j = 0; j < 31; ++j) {
				2281	point_double_small(pre->g_pre_comp[1][i][0],
				2282	pre->g_pre_comp[1][i][1],
				2283	pre->g_pre_comp[1][i][2],
				2284	pre->g_pre_comp[1][i][0],
				2285	pre->g_pre_comp[1][i][1],
				2286	pre->g_pre_comp[1][i][2]);
				2287	}
				2288	if (i == 8)
				2289	break;
				2290	point_double_small(pre->g_pre_comp[0][2 * i][0],
				2291	pre->g_pre_comp[0][2 * i][1],
				2292	pre->g_pre_comp[0][2 * i][2],
				2293	pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
				2294	pre->g_pre_comp[1][i][2]);
				2295	for (j = 0; j < 31; ++j) {
				2296	point_double_small(pre->g_pre_comp[0][2 * i][0],
				2297	pre->g_pre_comp[0][2 * i][1],
				2298	pre->g_pre_comp[0][2 * i][2],
				2299	pre->g_pre_comp[0][2 * i][0],
				2300	pre->g_pre_comp[0][2 * i][1],
				2301	pre->g_pre_comp[0][2 * i][2]);
				2302	}
				2303	}
				2304	for (i = 0; i < 2; i++) {
				2305	/* g_pre_comp[i][0] is the point at infinity */
				2306	memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
				2307	/* the remaining multiples */
				2308	/* 2^64G + 2^128G resp. 2^96G + 2^160G */
				2309	point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
				2310	pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
				2311	pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
				2312	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
				2313	pre->g_pre_comp[i][2][2]);
				2314	/* 2^64G + 2^192G resp. 2^96G + 2^224G */
				2315	point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
				2316	pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
				2317	pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
				2318	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
				2319	pre->g_pre_comp[i][2][2]);
				2320	/* 2^128G + 2^192G resp. 2^160G + 2^224G */
				2321	point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
				2322	pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
				2323	pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
				2324	pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
				2325	pre->g_pre_comp[i][4][2]);
				2326	/*
				2327	* 2^64G + 2^128G + 2^192G resp. 2^96G + 2^160G + 2^224G
				2328	*/
				2329	point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
				2330	pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
				2331	pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
				2332	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
				2333	pre->g_pre_comp[i][2][2]);
				2334	for (j = 1; j < 8; ++j) {
				2335	/* odd multiples: add G resp. 2^32G /
				2336	point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
				2337	pre->g_pre_comp[i][2 * j + 1][1],
				2338	pre->g_pre_comp[i][2 * j + 1][2],
				2339	pre->g_pre_comp[i][2 * j][0],
				2340	pre->g_pre_comp[i][2 * j][1],
				2341	pre->g_pre_comp[i][2 * j][2],
				2342	pre->g_pre_comp[i][1][0],
				2343	pre->g_pre_comp[i][1][1],
				2344	pre->g_pre_comp[i][1][2]);
				2345	}
				2346	}
				2347	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
				2348
				2349	done:
				2350	SETPRECOMP(group, nistp256, pre);
				2351	pre = NULL;
				2352	ret = 1;
				2353
				2354	err:
				2355	BN_CTX_end(ctx);
				2356	EC_POINT_free(generator);
				2357	BN_CTX_free(new_ctx);
				2358	EC_nistp256_pre_comp_free(pre);
				2359	return ret;
				2360	}
				2361
				2362	int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
				2363	{
				2364	return HAVEPRECOMP(group, nistp256);
				2365	}
				2366	#endif