Blame - ap/lib/libssl/openssl-1.1.1o/crypto/bn/bn_asm.c - R306

blob: 4d83a8cf1115de72cafcff85b43aa933f93db7cc [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame^]	1	/*
				2	* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
				3	*
				4	* Licensed under the OpenSSL license (the "License"). You may not use
				5	* this file except in compliance with the License. You can obtain a copy
				6	* in the file LICENSE in the source distribution or at
				7	* https://www.openssl.org/source/license.html
				8	*/
				9
				10	#include <assert.h>
				11	#include <openssl/crypto.h>
				12	#include "internal/cryptlib.h"
				13	#include "bn_local.h"
				14
				15	#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
				16
				17	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
				18	BN_ULONG w)
				19	{
				20	BN_ULONG c1 = 0;
				21
				22	assert(num >= 0);
				23	if (num <= 0)
				24	return c1;
				25
				26	# ifndef OPENSSL_SMALL_FOOTPRINT
				27	while (num & ~3) {
				28	mul_add(rp[0], ap[0], w, c1);
				29	mul_add(rp[1], ap[1], w, c1);
				30	mul_add(rp[2], ap[2], w, c1);
				31	mul_add(rp[3], ap[3], w, c1);
				32	ap += 4;
				33	rp += 4;
				34	num -= 4;
				35	}
				36	# endif
				37	while (num) {
				38	mul_add(rp[0], ap[0], w, c1);
				39	ap++;
				40	rp++;
				41	num--;
				42	}
				43
				44	return c1;
				45	}
				46
				47	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
				48	{
				49	BN_ULONG c1 = 0;
				50
				51	assert(num >= 0);
				52	if (num <= 0)
				53	return c1;
				54
				55	# ifndef OPENSSL_SMALL_FOOTPRINT
				56	while (num & ~3) {
				57	mul(rp[0], ap[0], w, c1);
				58	mul(rp[1], ap[1], w, c1);
				59	mul(rp[2], ap[2], w, c1);
				60	mul(rp[3], ap[3], w, c1);
				61	ap += 4;
				62	rp += 4;
				63	num -= 4;
				64	}
				65	# endif
				66	while (num) {
				67	mul(rp[0], ap[0], w, c1);
				68	ap++;
				69	rp++;
				70	num--;
				71	}
				72	return c1;
				73	}
				74
				75	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
				76	{
				77	assert(n >= 0);
				78	if (n <= 0)
				79	return;
				80
				81	# ifndef OPENSSL_SMALL_FOOTPRINT
				82	while (n & ~3) {
				83	sqr(r[0], r[1], a[0]);
				84	sqr(r[2], r[3], a[1]);
				85	sqr(r[4], r[5], a[2]);
				86	sqr(r[6], r[7], a[3]);
				87	a += 4;
				88	r += 8;
				89	n -= 4;
				90	}
				91	# endif
				92	while (n) {
				93	sqr(r[0], r[1], a[0]);
				94	a++;
				95	r += 2;
				96	n--;
				97	}
				98	}
				99
				100	#else /* !(defined(BN_LLONG) ||
				101	* defined(BN_UMULT_HIGH)) */
				102
				103	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
				104	BN_ULONG w)
				105	{
				106	BN_ULONG c = 0;
				107	BN_ULONG bl, bh;
				108
				109	assert(num >= 0);
				110	if (num <= 0)
				111	return (BN_ULONG)0;
				112
				113	bl = LBITS(w);
				114	bh = HBITS(w);
				115
				116	# ifndef OPENSSL_SMALL_FOOTPRINT
				117	while (num & ~3) {
				118	mul_add(rp[0], ap[0], bl, bh, c);
				119	mul_add(rp[1], ap[1], bl, bh, c);
				120	mul_add(rp[2], ap[2], bl, bh, c);
				121	mul_add(rp[3], ap[3], bl, bh, c);
				122	ap += 4;
				123	rp += 4;
				124	num -= 4;
				125	}
				126	# endif
				127	while (num) {
				128	mul_add(rp[0], ap[0], bl, bh, c);
				129	ap++;
				130	rp++;
				131	num--;
				132	}
				133	return c;
				134	}
				135
				136	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
				137	{
				138	BN_ULONG carry = 0;
				139	BN_ULONG bl, bh;
				140
				141	assert(num >= 0);
				142	if (num <= 0)
				143	return (BN_ULONG)0;
				144
				145	bl = LBITS(w);
				146	bh = HBITS(w);
				147
				148	# ifndef OPENSSL_SMALL_FOOTPRINT
				149	while (num & ~3) {
				150	mul(rp[0], ap[0], bl, bh, carry);
				151	mul(rp[1], ap[1], bl, bh, carry);
				152	mul(rp[2], ap[2], bl, bh, carry);
				153	mul(rp[3], ap[3], bl, bh, carry);
				154	ap += 4;
				155	rp += 4;
				156	num -= 4;
				157	}
				158	# endif
				159	while (num) {
				160	mul(rp[0], ap[0], bl, bh, carry);
				161	ap++;
				162	rp++;
				163	num--;
				164	}
				165	return carry;
				166	}
				167
				168	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
				169	{
				170	assert(n >= 0);
				171	if (n <= 0)
				172	return;
				173
				174	# ifndef OPENSSL_SMALL_FOOTPRINT
				175	while (n & ~3) {
				176	sqr64(r[0], r[1], a[0]);
				177	sqr64(r[2], r[3], a[1]);
				178	sqr64(r[4], r[5], a[2]);
				179	sqr64(r[6], r[7], a[3]);
				180	a += 4;
				181	r += 8;
				182	n -= 4;
				183	}
				184	# endif
				185	while (n) {
				186	sqr64(r[0], r[1], a[0]);
				187	a++;
				188	r += 2;
				189	n--;
				190	}
				191	}
				192
				193	#endif /* !(defined(BN_LLONG) ||
				194	* defined(BN_UMULT_HIGH)) */
				195
				196	#if defined(BN_LLONG) && defined(BN_DIV2W)
				197
				198	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
				199	{
				200	return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
				201	}
				202
				203	#else
				204
				205	/* Divide h,l by d and return the result. */
				206	/* I need to test this some more :-( */
				207	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
				208	{
				209	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
				210	int i, count = 2;
				211
				212	if (d == 0)
				213	return BN_MASK2;
				214
				215	i = BN_num_bits_word(d);
				216	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
				217
				218	i = BN_BITS2 - i;
				219	if (h >= d)
				220	h -= d;
				221
				222	if (i) {
				223	d <<= i;
				224	h = (h << i) \| (l >> (BN_BITS2 - i));
				225	l <<= i;
				226	}
				227	dh = (d & BN_MASK2h) >> BN_BITS4;
				228	dl = (d & BN_MASK2l);
				229	for (;;) {
				230	if ((h >> BN_BITS4) == dh)
				231	q = BN_MASK2l;
				232	else
				233	q = h / dh;
				234
				235	th = q * dh;
				236	tl = dl * q;
				237	for (;;) {
				238	t = h - th;
				239	if ((t & BN_MASK2h) \|\|
				240	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
				241	break;
				242	q--;
				243	th -= dh;
				244	tl -= dl;
				245	}
				246	t = (tl >> BN_BITS4);
				247	tl = (tl << BN_BITS4) & BN_MASK2h;
				248	th += t;
				249
				250	if (l < tl)
				251	th++;
				252	l -= tl;
				253	if (h < th) {
				254	h += d;
				255	q--;
				256	}
				257	h -= th;
				258
				259	if (--count == 0)
				260	break;
				261
				262	ret = q << BN_BITS4;
				263	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
				264	l = (l & BN_MASK2l) << BN_BITS4;
				265	}
				266	ret \|= q;
				267	return ret;
				268	}
				269	#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
				270
				271	#ifdef BN_LLONG
				272	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
				273	int n)
				274	{
				275	BN_ULLONG ll = 0;
				276
				277	assert(n >= 0);
				278	if (n <= 0)
				279	return (BN_ULONG)0;
				280
				281	# ifndef OPENSSL_SMALL_FOOTPRINT
				282	while (n & ~3) {
				283	ll += (BN_ULLONG) a[0] + b[0];
				284	r[0] = (BN_ULONG)ll & BN_MASK2;
				285	ll >>= BN_BITS2;
				286	ll += (BN_ULLONG) a[1] + b[1];
				287	r[1] = (BN_ULONG)ll & BN_MASK2;
				288	ll >>= BN_BITS2;
				289	ll += (BN_ULLONG) a[2] + b[2];
				290	r[2] = (BN_ULONG)ll & BN_MASK2;
				291	ll >>= BN_BITS2;
				292	ll += (BN_ULLONG) a[3] + b[3];
				293	r[3] = (BN_ULONG)ll & BN_MASK2;
				294	ll >>= BN_BITS2;
				295	a += 4;
				296	b += 4;
				297	r += 4;
				298	n -= 4;
				299	}
				300	# endif
				301	while (n) {
				302	ll += (BN_ULLONG) a[0] + b[0];
				303	r[0] = (BN_ULONG)ll & BN_MASK2;
				304	ll >>= BN_BITS2;
				305	a++;
				306	b++;
				307	r++;
				308	n--;
				309	}
				310	return (BN_ULONG)ll;
				311	}
				312	#else /* !BN_LLONG */
				313	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
				314	int n)
				315	{
				316	BN_ULONG c, l, t;
				317
				318	assert(n >= 0);
				319	if (n <= 0)
				320	return (BN_ULONG)0;
				321
				322	c = 0;
				323	# ifndef OPENSSL_SMALL_FOOTPRINT
				324	while (n & ~3) {
				325	t = a[0];
				326	t = (t + c) & BN_MASK2;
				327	c = (t < c);
				328	l = (t + b[0]) & BN_MASK2;
				329	c += (l < t);
				330	r[0] = l;
				331	t = a[1];
				332	t = (t + c) & BN_MASK2;
				333	c = (t < c);
				334	l = (t + b[1]) & BN_MASK2;
				335	c += (l < t);
				336	r[1] = l;
				337	t = a[2];
				338	t = (t + c) & BN_MASK2;
				339	c = (t < c);
				340	l = (t + b[2]) & BN_MASK2;
				341	c += (l < t);
				342	r[2] = l;
				343	t = a[3];
				344	t = (t + c) & BN_MASK2;
				345	c = (t < c);
				346	l = (t + b[3]) & BN_MASK2;
				347	c += (l < t);
				348	r[3] = l;
				349	a += 4;
				350	b += 4;
				351	r += 4;
				352	n -= 4;
				353	}
				354	# endif
				355	while (n) {
				356	t = a[0];
				357	t = (t + c) & BN_MASK2;
				358	c = (t < c);
				359	l = (t + b[0]) & BN_MASK2;
				360	c += (l < t);
				361	r[0] = l;
				362	a++;
				363	b++;
				364	r++;
				365	n--;
				366	}
				367	return (BN_ULONG)c;
				368	}
				369	#endif /* !BN_LLONG */
				370
				371	BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
				372	int n)
				373	{
				374	BN_ULONG t1, t2;
				375	int c = 0;
				376
				377	assert(n >= 0);
				378	if (n <= 0)
				379	return (BN_ULONG)0;
				380
				381	#ifndef OPENSSL_SMALL_FOOTPRINT
				382	while (n & ~3) {
				383	t1 = a[0];
				384	t2 = b[0];
				385	r[0] = (t1 - t2 - c) & BN_MASK2;
				386	if (t1 != t2)
				387	c = (t1 < t2);
				388	t1 = a[1];
				389	t2 = b[1];
				390	r[1] = (t1 - t2 - c) & BN_MASK2;
				391	if (t1 != t2)
				392	c = (t1 < t2);
				393	t1 = a[2];
				394	t2 = b[2];
				395	r[2] = (t1 - t2 - c) & BN_MASK2;
				396	if (t1 != t2)
				397	c = (t1 < t2);
				398	t1 = a[3];
				399	t2 = b[3];
				400	r[3] = (t1 - t2 - c) & BN_MASK2;
				401	if (t1 != t2)
				402	c = (t1 < t2);
				403	a += 4;
				404	b += 4;
				405	r += 4;
				406	n -= 4;
				407	}
				408	#endif
				409	while (n) {
				410	t1 = a[0];
				411	t2 = b[0];
				412	r[0] = (t1 - t2 - c) & BN_MASK2;
				413	if (t1 != t2)
				414	c = (t1 < t2);
				415	a++;
				416	b++;
				417	r++;
				418	n--;
				419	}
				420	return c;
				421	}
				422
				423	#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
				424
				425	# undef bn_mul_comba8
				426	# undef bn_mul_comba4
				427	# undef bn_sqr_comba8
				428	# undef bn_sqr_comba4
				429
				430	/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
				431	/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
				432	/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
				433	/*
				434	* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
				435	* c=(c2,c1,c0)
				436	*/
				437
				438	# ifdef BN_LLONG
				439	/*
				440	* Keep in mind that additions to multiplication result can not
				441	* overflow, because its high half cannot be all-ones.
				442	*/
				443	# define mul_add_c(a,b,c0,c1,c2) do { \
				444	BN_ULONG hi; \
				445	BN_ULLONG t = (BN_ULLONG)(a)*(b); \
				446	t += c0; /* no carry */ \
				447	c0 = (BN_ULONG)Lw(t); \
				448	hi = (BN_ULONG)Hw(t); \
				449	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				450	} while(0)
				451
				452	# define mul_add_c2(a,b,c0,c1,c2) do { \
				453	BN_ULONG hi; \
				454	BN_ULLONG t = (BN_ULLONG)(a)*(b); \
				455	BN_ULLONG tt = t+c0; /* no carry */ \
				456	c0 = (BN_ULONG)Lw(tt); \
				457	hi = (BN_ULONG)Hw(tt); \
				458	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				459	t += c0; /* no carry */ \
				460	c0 = (BN_ULONG)Lw(t); \
				461	hi = (BN_ULONG)Hw(t); \
				462	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				463	} while(0)
				464
				465	# define sqr_add_c(a,i,c0,c1,c2) do { \
				466	BN_ULONG hi; \
				467	BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
				468	t += c0; /* no carry */ \
				469	c0 = (BN_ULONG)Lw(t); \
				470	hi = (BN_ULONG)Hw(t); \
				471	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				472	} while(0)
				473
				474	# define sqr_add_c2(a,i,j,c0,c1,c2) \
				475	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
				476
				477	# elif defined(BN_UMULT_LOHI)
				478	/*
				479	* Keep in mind that additions to hi can not overflow, because
				480	* the high word of a multiplication result cannot be all-ones.
				481	*/
				482	# define mul_add_c(a,b,c0,c1,c2) do { \
				483	BN_ULONG ta = (a), tb = (b); \
				484	BN_ULONG lo, hi; \
				485	BN_UMULT_LOHI(lo,hi,ta,tb); \
				486	c0 += lo; hi += (c0<lo)?1:0; \
				487	c1 += hi; c2 += (c1<hi)?1:0; \
				488	} while(0)
				489
				490	# define mul_add_c2(a,b,c0,c1,c2) do { \
				491	BN_ULONG ta = (a), tb = (b); \
				492	BN_ULONG lo, hi, tt; \
				493	BN_UMULT_LOHI(lo,hi,ta,tb); \
				494	c0 += lo; tt = hi+((c0<lo)?1:0); \
				495	c1 += tt; c2 += (c1<tt)?1:0; \
				496	c0 += lo; hi += (c0<lo)?1:0; \
				497	c1 += hi; c2 += (c1<hi)?1:0; \
				498	} while(0)
				499
				500	# define sqr_add_c(a,i,c0,c1,c2) do { \
				501	BN_ULONG ta = (a)[i]; \
				502	BN_ULONG lo, hi; \
				503	BN_UMULT_LOHI(lo,hi,ta,ta); \
				504	c0 += lo; hi += (c0<lo)?1:0; \
				505	c1 += hi; c2 += (c1<hi)?1:0; \
				506	} while(0)
				507
				508	# define sqr_add_c2(a,i,j,c0,c1,c2) \
				509	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
				510
				511	# elif defined(BN_UMULT_HIGH)
				512	/*
				513	* Keep in mind that additions to hi can not overflow, because
				514	* the high word of a multiplication result cannot be all-ones.
				515	*/
				516	# define mul_add_c(a,b,c0,c1,c2) do { \
				517	BN_ULONG ta = (a), tb = (b); \
				518	BN_ULONG lo = ta * tb; \
				519	BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
				520	c0 += lo; hi += (c0<lo)?1:0; \
				521	c1 += hi; c2 += (c1<hi)?1:0; \
				522	} while(0)
				523
				524	# define mul_add_c2(a,b,c0,c1,c2) do { \
				525	BN_ULONG ta = (a), tb = (b), tt; \
				526	BN_ULONG lo = ta * tb; \
				527	BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
				528	c0 += lo; tt = hi + ((c0<lo)?1:0); \
				529	c1 += tt; c2 += (c1<tt)?1:0; \
				530	c0 += lo; hi += (c0<lo)?1:0; \
				531	c1 += hi; c2 += (c1<hi)?1:0; \
				532	} while(0)
				533
				534	# define sqr_add_c(a,i,c0,c1,c2) do { \
				535	BN_ULONG ta = (a)[i]; \
				536	BN_ULONG lo = ta * ta; \
				537	BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
				538	c0 += lo; hi += (c0<lo)?1:0; \
				539	c1 += hi; c2 += (c1<hi)?1:0; \
				540	} while(0)
				541
				542	# define sqr_add_c2(a,i,j,c0,c1,c2) \
				543	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
				544
				545	# else /* !BN_LLONG */
				546	/*
				547	* Keep in mind that additions to hi can not overflow, because
				548	* the high word of a multiplication result cannot be all-ones.
				549	*/
				550	# define mul_add_c(a,b,c0,c1,c2) do { \
				551	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
				552	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
				553	mul64(lo,hi,bl,bh); \
				554	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
				555	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				556	} while(0)
				557
				558	# define mul_add_c2(a,b,c0,c1,c2) do { \
				559	BN_ULONG tt; \
				560	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
				561	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
				562	mul64(lo,hi,bl,bh); \
				563	tt = hi; \
				564	c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
				565	c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
				566	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
				567	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				568	} while(0)
				569
				570	# define sqr_add_c(a,i,c0,c1,c2) do { \
				571	BN_ULONG lo, hi; \
				572	sqr64(lo,hi,(a)[i]); \
				573	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
				574	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
				575	} while(0)
				576
				577	# define sqr_add_c2(a,i,j,c0,c1,c2) \
				578	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
				579	# endif /* !BN_LLONG */
				580
				581	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				582	{
				583	BN_ULONG c1, c2, c3;
				584
				585	c1 = 0;
				586	c2 = 0;
				587	c3 = 0;
				588	mul_add_c(a[0], b[0], c1, c2, c3);
				589	r[0] = c1;
				590	c1 = 0;
				591	mul_add_c(a[0], b[1], c2, c3, c1);
				592	mul_add_c(a[1], b[0], c2, c3, c1);
				593	r[1] = c2;
				594	c2 = 0;
				595	mul_add_c(a[2], b[0], c3, c1, c2);
				596	mul_add_c(a[1], b[1], c3, c1, c2);
				597	mul_add_c(a[0], b[2], c3, c1, c2);
				598	r[2] = c3;
				599	c3 = 0;
				600	mul_add_c(a[0], b[3], c1, c2, c3);
				601	mul_add_c(a[1], b[2], c1, c2, c3);
				602	mul_add_c(a[2], b[1], c1, c2, c3);
				603	mul_add_c(a[3], b[0], c1, c2, c3);
				604	r[3] = c1;
				605	c1 = 0;
				606	mul_add_c(a[4], b[0], c2, c3, c1);
				607	mul_add_c(a[3], b[1], c2, c3, c1);
				608	mul_add_c(a[2], b[2], c2, c3, c1);
				609	mul_add_c(a[1], b[3], c2, c3, c1);
				610	mul_add_c(a[0], b[4], c2, c3, c1);
				611	r[4] = c2;
				612	c2 = 0;
				613	mul_add_c(a[0], b[5], c3, c1, c2);
				614	mul_add_c(a[1], b[4], c3, c1, c2);
				615	mul_add_c(a[2], b[3], c3, c1, c2);
				616	mul_add_c(a[3], b[2], c3, c1, c2);
				617	mul_add_c(a[4], b[1], c3, c1, c2);
				618	mul_add_c(a[5], b[0], c3, c1, c2);
				619	r[5] = c3;
				620	c3 = 0;
				621	mul_add_c(a[6], b[0], c1, c2, c3);
				622	mul_add_c(a[5], b[1], c1, c2, c3);
				623	mul_add_c(a[4], b[2], c1, c2, c3);
				624	mul_add_c(a[3], b[3], c1, c2, c3);
				625	mul_add_c(a[2], b[4], c1, c2, c3);
				626	mul_add_c(a[1], b[5], c1, c2, c3);
				627	mul_add_c(a[0], b[6], c1, c2, c3);
				628	r[6] = c1;
				629	c1 = 0;
				630	mul_add_c(a[0], b[7], c2, c3, c1);
				631	mul_add_c(a[1], b[6], c2, c3, c1);
				632	mul_add_c(a[2], b[5], c2, c3, c1);
				633	mul_add_c(a[3], b[4], c2, c3, c1);
				634	mul_add_c(a[4], b[3], c2, c3, c1);
				635	mul_add_c(a[5], b[2], c2, c3, c1);
				636	mul_add_c(a[6], b[1], c2, c3, c1);
				637	mul_add_c(a[7], b[0], c2, c3, c1);
				638	r[7] = c2;
				639	c2 = 0;
				640	mul_add_c(a[7], b[1], c3, c1, c2);
				641	mul_add_c(a[6], b[2], c3, c1, c2);
				642	mul_add_c(a[5], b[3], c3, c1, c2);
				643	mul_add_c(a[4], b[4], c3, c1, c2);
				644	mul_add_c(a[3], b[5], c3, c1, c2);
				645	mul_add_c(a[2], b[6], c3, c1, c2);
				646	mul_add_c(a[1], b[7], c3, c1, c2);
				647	r[8] = c3;
				648	c3 = 0;
				649	mul_add_c(a[2], b[7], c1, c2, c3);
				650	mul_add_c(a[3], b[6], c1, c2, c3);
				651	mul_add_c(a[4], b[5], c1, c2, c3);
				652	mul_add_c(a[5], b[4], c1, c2, c3);
				653	mul_add_c(a[6], b[3], c1, c2, c3);
				654	mul_add_c(a[7], b[2], c1, c2, c3);
				655	r[9] = c1;
				656	c1 = 0;
				657	mul_add_c(a[7], b[3], c2, c3, c1);
				658	mul_add_c(a[6], b[4], c2, c3, c1);
				659	mul_add_c(a[5], b[5], c2, c3, c1);
				660	mul_add_c(a[4], b[6], c2, c3, c1);
				661	mul_add_c(a[3], b[7], c2, c3, c1);
				662	r[10] = c2;
				663	c2 = 0;
				664	mul_add_c(a[4], b[7], c3, c1, c2);
				665	mul_add_c(a[5], b[6], c3, c1, c2);
				666	mul_add_c(a[6], b[5], c3, c1, c2);
				667	mul_add_c(a[7], b[4], c3, c1, c2);
				668	r[11] = c3;
				669	c3 = 0;
				670	mul_add_c(a[7], b[5], c1, c2, c3);
				671	mul_add_c(a[6], b[6], c1, c2, c3);
				672	mul_add_c(a[5], b[7], c1, c2, c3);
				673	r[12] = c1;
				674	c1 = 0;
				675	mul_add_c(a[6], b[7], c2, c3, c1);
				676	mul_add_c(a[7], b[6], c2, c3, c1);
				677	r[13] = c2;
				678	c2 = 0;
				679	mul_add_c(a[7], b[7], c3, c1, c2);
				680	r[14] = c3;
				681	r[15] = c1;
				682	}
				683
				684	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				685	{
				686	BN_ULONG c1, c2, c3;
				687
				688	c1 = 0;
				689	c2 = 0;
				690	c3 = 0;
				691	mul_add_c(a[0], b[0], c1, c2, c3);
				692	r[0] = c1;
				693	c1 = 0;
				694	mul_add_c(a[0], b[1], c2, c3, c1);
				695	mul_add_c(a[1], b[0], c2, c3, c1);
				696	r[1] = c2;
				697	c2 = 0;
				698	mul_add_c(a[2], b[0], c3, c1, c2);
				699	mul_add_c(a[1], b[1], c3, c1, c2);
				700	mul_add_c(a[0], b[2], c3, c1, c2);
				701	r[2] = c3;
				702	c3 = 0;
				703	mul_add_c(a[0], b[3], c1, c2, c3);
				704	mul_add_c(a[1], b[2], c1, c2, c3);
				705	mul_add_c(a[2], b[1], c1, c2, c3);
				706	mul_add_c(a[3], b[0], c1, c2, c3);
				707	r[3] = c1;
				708	c1 = 0;
				709	mul_add_c(a[3], b[1], c2, c3, c1);
				710	mul_add_c(a[2], b[2], c2, c3, c1);
				711	mul_add_c(a[1], b[3], c2, c3, c1);
				712	r[4] = c2;
				713	c2 = 0;
				714	mul_add_c(a[2], b[3], c3, c1, c2);
				715	mul_add_c(a[3], b[2], c3, c1, c2);
				716	r[5] = c3;
				717	c3 = 0;
				718	mul_add_c(a[3], b[3], c1, c2, c3);
				719	r[6] = c1;
				720	r[7] = c2;
				721	}
				722
				723	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
				724	{
				725	BN_ULONG c1, c2, c3;
				726
				727	c1 = 0;
				728	c2 = 0;
				729	c3 = 0;
				730	sqr_add_c(a, 0, c1, c2, c3);
				731	r[0] = c1;
				732	c1 = 0;
				733	sqr_add_c2(a, 1, 0, c2, c3, c1);
				734	r[1] = c2;
				735	c2 = 0;
				736	sqr_add_c(a, 1, c3, c1, c2);
				737	sqr_add_c2(a, 2, 0, c3, c1, c2);
				738	r[2] = c3;
				739	c3 = 0;
				740	sqr_add_c2(a, 3, 0, c1, c2, c3);
				741	sqr_add_c2(a, 2, 1, c1, c2, c3);
				742	r[3] = c1;
				743	c1 = 0;
				744	sqr_add_c(a, 2, c2, c3, c1);
				745	sqr_add_c2(a, 3, 1, c2, c3, c1);
				746	sqr_add_c2(a, 4, 0, c2, c3, c1);
				747	r[4] = c2;
				748	c2 = 0;
				749	sqr_add_c2(a, 5, 0, c3, c1, c2);
				750	sqr_add_c2(a, 4, 1, c3, c1, c2);
				751	sqr_add_c2(a, 3, 2, c3, c1, c2);
				752	r[5] = c3;
				753	c3 = 0;
				754	sqr_add_c(a, 3, c1, c2, c3);
				755	sqr_add_c2(a, 4, 2, c1, c2, c3);
				756	sqr_add_c2(a, 5, 1, c1, c2, c3);
				757	sqr_add_c2(a, 6, 0, c1, c2, c3);
				758	r[6] = c1;
				759	c1 = 0;
				760	sqr_add_c2(a, 7, 0, c2, c3, c1);
				761	sqr_add_c2(a, 6, 1, c2, c3, c1);
				762	sqr_add_c2(a, 5, 2, c2, c3, c1);
				763	sqr_add_c2(a, 4, 3, c2, c3, c1);
				764	r[7] = c2;
				765	c2 = 0;
				766	sqr_add_c(a, 4, c3, c1, c2);
				767	sqr_add_c2(a, 5, 3, c3, c1, c2);
				768	sqr_add_c2(a, 6, 2, c3, c1, c2);
				769	sqr_add_c2(a, 7, 1, c3, c1, c2);
				770	r[8] = c3;
				771	c3 = 0;
				772	sqr_add_c2(a, 7, 2, c1, c2, c3);
				773	sqr_add_c2(a, 6, 3, c1, c2, c3);
				774	sqr_add_c2(a, 5, 4, c1, c2, c3);
				775	r[9] = c1;
				776	c1 = 0;
				777	sqr_add_c(a, 5, c2, c3, c1);
				778	sqr_add_c2(a, 6, 4, c2, c3, c1);
				779	sqr_add_c2(a, 7, 3, c2, c3, c1);
				780	r[10] = c2;
				781	c2 = 0;
				782	sqr_add_c2(a, 7, 4, c3, c1, c2);
				783	sqr_add_c2(a, 6, 5, c3, c1, c2);
				784	r[11] = c3;
				785	c3 = 0;
				786	sqr_add_c(a, 6, c1, c2, c3);
				787	sqr_add_c2(a, 7, 5, c1, c2, c3);
				788	r[12] = c1;
				789	c1 = 0;
				790	sqr_add_c2(a, 7, 6, c2, c3, c1);
				791	r[13] = c2;
				792	c2 = 0;
				793	sqr_add_c(a, 7, c3, c1, c2);
				794	r[14] = c3;
				795	r[15] = c1;
				796	}
				797
				798	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
				799	{
				800	BN_ULONG c1, c2, c3;
				801
				802	c1 = 0;
				803	c2 = 0;
				804	c3 = 0;
				805	sqr_add_c(a, 0, c1, c2, c3);
				806	r[0] = c1;
				807	c1 = 0;
				808	sqr_add_c2(a, 1, 0, c2, c3, c1);
				809	r[1] = c2;
				810	c2 = 0;
				811	sqr_add_c(a, 1, c3, c1, c2);
				812	sqr_add_c2(a, 2, 0, c3, c1, c2);
				813	r[2] = c3;
				814	c3 = 0;
				815	sqr_add_c2(a, 3, 0, c1, c2, c3);
				816	sqr_add_c2(a, 2, 1, c1, c2, c3);
				817	r[3] = c1;
				818	c1 = 0;
				819	sqr_add_c(a, 2, c2, c3, c1);
				820	sqr_add_c2(a, 3, 1, c2, c3, c1);
				821	r[4] = c2;
				822	c2 = 0;
				823	sqr_add_c2(a, 3, 2, c3, c1, c2);
				824	r[5] = c3;
				825	c3 = 0;
				826	sqr_add_c(a, 3, c1, c2, c3);
				827	r[6] = c1;
				828	r[7] = c2;
				829	}
				830
				831	# ifdef OPENSSL_NO_ASM
				832	# ifdef OPENSSL_BN_ASM_MONT
				833	# include <alloca.h>
				834	/*
				835	* This is essentially reference implementation, which may or may not
				836	* result in performance improvement. E.g. on IA-32 this routine was
				837	* observed to give 40% faster rsa1024 private key operations and 10%
				838	* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
				839	* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
				840	* reference implementation, one to be used as starting point for
				841	* platform-specific assembler. Mentioned numbers apply to compiler
				842	* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
				843	* can vary not only from platform to platform, but even for compiler
				844	* versions. Assembler vs. assembler improvement coefficients can
				845	* [and are known to] differ and are to be documented elsewhere.
				846	*/
				847	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
				848	const BN_ULONG np, const BN_ULONG n0p, int num)
				849	{
				850	BN_ULONG c0, c1, ml, *tp, n0;
				851	# ifdef mul64
				852	BN_ULONG mh;
				853	# endif
				854	volatile BN_ULONG *vp;
				855	int i = 0, j;
				856
				857	# if 0 /* template for platform-specific
				858	* implementation */
				859	if (ap == bp)
				860	return bn_sqr_mont(rp, ap, np, n0p, num);
				861	# endif
				862	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
				863
				864	n0 = *n0p;
				865
				866	c0 = 0;
				867	ml = bp[0];
				868	# ifdef mul64
				869	mh = HBITS(ml);
				870	ml = LBITS(ml);
				871	for (j = 0; j < num; ++j)
				872	mul(tp[j], ap[j], ml, mh, c0);
				873	# else
				874	for (j = 0; j < num; ++j)
				875	mul(tp[j], ap[j], ml, c0);
				876	# endif
				877
				878	tp[num] = c0;
				879	tp[num + 1] = 0;
				880	goto enter;
				881
				882	for (i = 0; i < num; i++) {
				883	c0 = 0;
				884	ml = bp[i];
				885	# ifdef mul64
				886	mh = HBITS(ml);
				887	ml = LBITS(ml);
				888	for (j = 0; j < num; ++j)
				889	mul_add(tp[j], ap[j], ml, mh, c0);
				890	# else
				891	for (j = 0; j < num; ++j)
				892	mul_add(tp[j], ap[j], ml, c0);
				893	# endif
				894	c1 = (tp[num] + c0) & BN_MASK2;
				895	tp[num] = c1;
				896	tp[num + 1] = (c1 < c0 ? 1 : 0);
				897	enter:
				898	c1 = tp[0];
				899	ml = (c1 * n0) & BN_MASK2;
				900	c0 = 0;
				901	# ifdef mul64
				902	mh = HBITS(ml);
				903	ml = LBITS(ml);
				904	mul_add(c1, np[0], ml, mh, c0);
				905	# else
				906	mul_add(c1, ml, np[0], c0);
				907	# endif
				908	for (j = 1; j < num; j++) {
				909	c1 = tp[j];
				910	# ifdef mul64
				911	mul_add(c1, np[j], ml, mh, c0);
				912	# else
				913	mul_add(c1, ml, np[j], c0);
				914	# endif
				915	tp[j - 1] = c1 & BN_MASK2;
				916	}
				917	c1 = (tp[num] + c0) & BN_MASK2;
				918	tp[num - 1] = c1;
				919	tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
				920	}
				921
				922	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
				923	c0 = bn_sub_words(rp, tp, np, num);
				924	if (tp[num] != 0 \|\| c0 == 0) {
				925	for (i = 0; i < num + 2; i++)
				926	vp[i] = 0;
				927	return 1;
				928	}
				929	}
				930	for (i = 0; i < num; i++)
				931	rp[i] = tp[i], vp[i] = 0;
				932	vp[num] = 0;
				933	vp[num + 1] = 0;
				934	return 1;
				935	}
				936	# else
				937	/*
				938	* Return value of 0 indicates that multiplication/convolution was not
				939	* performed to signal the caller to fall down to alternative/original
				940	* code-path.
				941	*/
				942	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
				943	const BN_ULONG np, const BN_ULONG n0, int num)
				944	{
				945	return 0;
				946	}
				947	# endif /* OPENSSL_BN_ASM_MONT */
				948	# endif
				949
				950	#else /* !BN_MUL_COMBA */
				951
				952	/* hmm... is it faster just to do a multiply? */
				953	# undef bn_sqr_comba4
				954	# undef bn_sqr_comba8
				955	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
				956	{
				957	BN_ULONG t[8];
				958	bn_sqr_normal(r, a, 4, t);
				959	}
				960
				961	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
				962	{
				963	BN_ULONG t[16];
				964	bn_sqr_normal(r, a, 8, t);
				965	}
				966
				967	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				968	{
				969	r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
				970	r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
				971	r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
				972	r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
				973	}
				974
				975	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
				976	{
				977	r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
				978	r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
				979	r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
				980	r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
				981	r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
				982	r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
				983	r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
				984	r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
				985	}
				986
				987	# ifdef OPENSSL_NO_ASM
				988	# ifdef OPENSSL_BN_ASM_MONT
				989	# include <alloca.h>
				990	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
				991	const BN_ULONG np, const BN_ULONG n0p, int num)
				992	{
				993	BN_ULONG c0, c1, tp, n0 = n0p;
				994	volatile BN_ULONG *vp;
				995	int i = 0, j;
				996
				997	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
				998
				999	for (i = 0; i <= num; i++)
				1000	tp[i] = 0;
				1001
				1002	for (i = 0; i < num; i++) {
				1003	c0 = bn_mul_add_words(tp, ap, num, bp[i]);
				1004	c1 = (tp[num] + c0) & BN_MASK2;
				1005	tp[num] = c1;
				1006	tp[num + 1] = (c1 < c0 ? 1 : 0);
				1007
				1008	c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
				1009	c1 = (tp[num] + c0) & BN_MASK2;
				1010	tp[num] = c1;
				1011	tp[num + 1] += (c1 < c0 ? 1 : 0);
				1012	for (j = 0; j <= num; j++)
				1013	tp[j] = tp[j + 1];
				1014	}
				1015
				1016	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
				1017	c0 = bn_sub_words(rp, tp, np, num);
				1018	if (tp[num] != 0 \|\| c0 == 0) {
				1019	for (i = 0; i < num + 2; i++)
				1020	vp[i] = 0;
				1021	return 1;
				1022	}
				1023	}
				1024	for (i = 0; i < num; i++)
				1025	rp[i] = tp[i], vp[i] = 0;
				1026	vp[num] = 0;
				1027	vp[num + 1] = 0;
				1028	return 1;
				1029	}
				1030	# else
				1031	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
				1032	const BN_ULONG np, const BN_ULONG n0, int num)
				1033	{
				1034	return 0;
				1035	}
				1036	# endif /* OPENSSL_BN_ASM_MONT */
				1037	# endif
				1038
				1039	#endif /* !BN_MUL_COMBA */