Blame - ap/lib/libssl/openssl-1.1.1o/crypto/bn/asm/vis3-mont.pl - R306

blob: d797af8745dc150aaccafa60622992b8c3592d4f [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame^]	1	#! /usr/bin/env perl
				2	# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
				3	#
				4	# Licensed under the OpenSSL license (the "License"). You may not use
				5	# this file except in compliance with the License. You can obtain a copy
				6	# in the file LICENSE in the source distribution or at
				7	# https://www.openssl.org/source/license.html
				8
				9
				10	# ====================================================================
				11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				12	# project. The module is, however, dual licensed under OpenSSL and
				13	# CRYPTOGAMS licenses depending on where you obtain it. For further
				14	# details see http://www.openssl.org/~appro/cryptogams/.
				15	# ====================================================================
				16
				17	# October 2012.
				18	#
				19	# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
				20	# onward. There are three new instructions used here: umulxhi,
				21	# addxc[cc] and initializing store. On T3 RSA private key operations
				22	# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
				23	# lengths. This is without dedicated squaring procedure. On T4
				24	# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
				25	# for reference purposes, because T4 has dedicated Montgomery
				26	# multiplication and squaring instructions that deliver even more.
				27
				# The last command-line argument names the file that receives the
				# generated assembly. The three-argument form of open keeps characters
				# in the name from being read as part of the mode, and the result is
				# checked so that a bad path fails here instead of producing no output.
				# When no file name is given, STDOUT is left as it is.
				$output = pop;
				if (defined $output && length $output) {
				    open STDOUT, '>', $output or die "can't open $output: $!";
				}

				# Symbolic names substituted into the assembly template; presumably
				# resolved by the C preprocessor via sparc_arch.h -- TODO confirm.
				$frame = "STACK_FRAME";
				$bias = "STACK_BIAS";
				33
				34	$code.=<<___;
				35	#include "sparc_arch.h"
				36
				37	#ifdef __arch64__
				38	.register %g2,#scratch
				39	.register %g3,#scratch
				40	#endif
				41
				42	.section ".text",#alloc,#execinstr
				43	___
				44
				# Register names used by the assembly template. The first five names
				# are bound to %g1..%g5 and the remaining seven to %o0..%o5 and %o7;
				# %o6 is deliberately absent from the list.
				45	($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
				46	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
				47
				# Argument registers as named for the entry sequence that is appended
				# to $code right after these assignments.
				48	# int bn_mul_mont(
				49	$rp="%o0"; # BN_ULONG *rp,
				50	$ap="%o1"; # const BN_ULONG *ap,
				51	$bp="%o2"; # const BN_ULONG *bp,
				52	$np="%o3"; # const BN_ULONG *np,
				53	$n0p="%o4"; # const BN_ULONG *n0,
				54	$num="%o5"; # int num); # caller ensures that num is even
				55	# and >=6
				56	$code.=<<___;
				57	.globl bn_mul_mont_vis3
				58	.align 32
				59	bn_mul_mont_vis3:
				60	add %sp, $bias, %g4 ! real top of stack
				61	sll $num, 2, $num ! size in bytes
				62	add $num, 63, %g5
				63	andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
				64	add %g5, %g5, %g1
				65	add %g5, %g1, %g1 ! 3*buffer size
				66	sub %g4, %g1, %g1
				67	andn %g1, 63, %g1 ! align at 64 byte
				68	sub %g1, $frame, %g1 ! new top of stack
				69	sub %g1, %g4, %g1
				70
				71	save %sp, %g1, %sp
				72	___
				73
				74	# +-------------------------------+<----- %sp
				75	# . .
				76	# +-------------------------------+<----- aligned at 64 bytes
				77	# | __int64 tmp[0] |
				78	# +-------------------------------+
				79	# . .
				80	# . .
				81	# +-------------------------------+<----- aligned at 64 bytes
				82	# | __int64 ap[1..0] | converted ap[]
				83	# +-------------------------------+
				84	# | __int64 np[1..0] | converted np[]
				85	# +-------------------------------+
				86	# | __int64 ap[3..2] |
				87	# . .
				88	# . .
				89	# +-------------------------------+
				# The template appended below follows the `save` instruction emitted
				# earlier, so the six arguments are now named through %i0..%i5.
				90	($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
				# %l0..%l7 in order: four temporaries, the loop counter, the tp and
				# anp pointers. $bufsz (%l6) is named here but is not referenced
				# anywhere else in the visible code.
				91	($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
				# $ovf and $i share registers with $t0 and $t1, so each pair member
				# must not be live at the same time as its alias.
				92	($ovf,$i)=($t0,$t1);
				93	$code.=<<___;
				94	ld [$n0p+0], $t0 ! pull n0[0..1] value
				95	add %sp, $bias+$frame, $tp
				96	ld [$n0p+4], $t1
				97	add $tp, %g5, $anp
				98	ld [$bp+0], $t2 ! m0=bp[0]
				99	sllx $t1, 32, $n0
				100	ld [$bp+4], $t3
				101	or $t0, $n0, $n0
				102	add $bp, 8, $bp
				103
				104	ld [$ap+0], $t0 ! ap[0]
				105	sllx $t3, 32, $m0
				106	ld [$ap+4], $t1
				107	or $t2, $m0, $m0
				108
				109	ld [$ap+8], $t2 ! ap[1]
				110	sllx $t1, 32, $aj
				111	ld [$ap+12], $t3
				112	or $t0, $aj, $aj
				113	add $ap, 16, $ap
				114	stx $aj, [$anp] ! converted ap[0]
				115
				116	mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
				117	umulxhi $aj, $m0, $hi0
				118
				119	ld [$np+0], $t0 ! np[0]
				120	sllx $t3, 32, $aj
				121	ld [$np+4], $t1
				122	or $t2, $aj, $aj
				123
				124	ld [$np+8], $t2 ! np[1]
				125	sllx $t1, 32, $nj
				126	ld [$np+12], $t3
				127	or $t0, $nj, $nj
				128	add $np, 16, $np
				129	stx $nj, [$anp+8] ! converted np[0]
				130
				131	mulx $lo0, $n0, $m1 ! "tp[0]"*n0
				132	stx $aj, [$anp+16] ! converted ap[1]
				133
				134	mulx $aj, $m0, $alo ! ap[1]*bp[0]
				135	umulxhi $aj, $m0, $aj ! ahi=aj
				136
				137	mulx $nj, $m1, $lo1 ! np[0]*m1
				138	umulxhi $nj, $m1, $hi1
				139
				140	sllx $t3, 32, $nj
				141	or $t2, $nj, $nj
				142	stx $nj, [$anp+24] ! converted np[1]
				143	add $anp, 32, $anp
				144
				145	addcc $lo0, $lo1, $lo1
				146	addxc %g0, $hi1, $hi1
				147
				148	mulx $nj, $m1, $nlo ! np[1]*m1
				149	umulxhi $nj, $m1, $nj ! nhi=nj
				150
				151	ba .L1st
				152	sub $num, 24, $cnt ! cnt=num-3
				153
				154	.align 16
				155	.L1st:
				156	ld [$ap+0], $t0 ! ap[j]
				157	addcc $alo, $hi0, $lo0
				158	ld [$ap+4], $t1
				159	addxc $aj, %g0, $hi0
				160
				161	sllx $t1, 32, $aj
				162	add $ap, 8, $ap
				163	or $t0, $aj, $aj
				164	stx $aj, [$anp] ! converted ap[j]
				165
				166	ld [$np+0], $t2 ! np[j]
				167	addcc $nlo, $hi1, $lo1
				168	ld [$np+4], $t3
				169	addxc $nj, %g0, $hi1 ! nhi=nj
				170
				171	sllx $t3, 32, $nj
				172	add $np, 8, $np
				173	mulx $aj, $m0, $alo ! ap[j]*bp[0]
				174	or $t2, $nj, $nj
				175	umulxhi $aj, $m0, $aj ! ahi=aj
				176	stx $nj, [$anp+8] ! converted np[j]
				177	add $anp, 16, $anp ! anp++
				178
				179	mulx $nj, $m1, $nlo ! np[j]*m1
				180	addcc $lo0, $lo1, $lo1 ! np[j]m1+ap[j]bp[0]
				181	umulxhi $nj, $m1, $nj ! nhi=nj
				182	addxc %g0, $hi1, $hi1
				183	stx $lo1, [$tp] ! tp[j-1]
				184	add $tp, 8, $tp ! tp++
				185
				186	brnz,pt $cnt, .L1st
				187	sub $cnt, 8, $cnt ! j--
				188	!.L1st
				189	addcc $alo, $hi0, $lo0
				190	addxc $aj, %g0, $hi0 ! ahi=aj
				191
				192	addcc $nlo, $hi1, $lo1
				193	addxc $nj, %g0, $hi1
				194	addcc $lo0, $lo1, $lo1 ! np[j]m1+ap[j]bp[0]
				195	addxc %g0, $hi1, $hi1
				196	stx $lo1, [$tp] ! tp[j-1]
				197	add $tp, 8, $tp
				198
				199	addcc $hi0, $hi1, $hi1
				200	addxc %g0, %g0, $ovf ! upmost overflow bit
				201	stx $hi1, [$tp]
				202	add $tp, 8, $tp
				203
				204	ba .Louter
				205	sub $num, 16, $i ! i=num-2
				206
				207	.align 16
				208	.Louter:
				209	ld [$bp+0], $t2 ! m0=bp[i]
				210	ld [$bp+4], $t3
				211
				212	sub $anp, $num, $anp ! rewind
				213	sub $tp, $num, $tp
				214	sub $anp, $num, $anp
				215
				216	add $bp, 8, $bp
				217	sllx $t3, 32, $m0
				218	ldx [$anp+0], $aj ! ap[0]
				219	or $t2, $m0, $m0
				220	ldx [$anp+8], $nj ! np[0]
				221
				222	mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
				223	ldx [$tp], $tj ! tp[0]
				224	umulxhi $aj, $m0, $hi0
				225	ldx [$anp+16], $aj ! ap[1]
				226	addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
				227	mulx $aj, $m0, $alo ! ap[1]*bp[i]
				228	addxc %g0, $hi0, $hi0
				229	mulx $lo0, $n0, $m1 ! tp[0]*n0
				230	umulxhi $aj, $m0, $aj ! ahi=aj
				231	mulx $nj, $m1, $lo1 ! np[0]*m1
				232	umulxhi $nj, $m1, $hi1
				233	ldx [$anp+24], $nj ! np[1]
				234	add $anp, 32, $anp
				235	addcc $lo1, $lo0, $lo1
				236	mulx $nj, $m1, $nlo ! np[1]*m1
				237	addxc %g0, $hi1, $hi1
				238	umulxhi $nj, $m1, $nj ! nhi=nj
				239
				240	ba .Linner
				241	sub $num, 24, $cnt ! cnt=num-3
				242	.align 16
				243	.Linner:
				244	addcc $alo, $hi0, $lo0
				245	ldx [$tp+8], $tj ! tp[j]
				246	addxc $aj, %g0, $hi0 ! ahi=aj
				247	ldx [$anp+0], $aj ! ap[j]
				248	addcc $nlo, $hi1, $lo1
				249	mulx $aj, $m0, $alo ! ap[j]*bp[i]
				250	addxc $nj, %g0, $hi1 ! nhi=nj
				251	ldx [$anp+8], $nj ! np[j]
				252	add $anp, 16, $anp
				253	umulxhi $aj, $m0, $aj ! ahi=aj
				254	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
				255	mulx $nj, $m1, $nlo ! np[j]*m1
				256	addxc %g0, $hi0, $hi0
				257	umulxhi $nj, $m1, $nj ! nhi=nj
				258	addcc $lo1, $lo0, $lo1 ! np[j]m1+ap[j]bp[i]+tp[j]
				259	addxc %g0, $hi1, $hi1
				260	stx $lo1, [$tp] ! tp[j-1]
				261	add $tp, 8, $tp
				262	brnz,pt $cnt, .Linner
				263	sub $cnt, 8, $cnt
				264	!.Linner
				265	ldx [$tp+8], $tj ! tp[j]
				266	addcc $alo, $hi0, $lo0
				267	addxc $aj, %g0, $hi0 ! ahi=aj
				268	addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
				269	addxc %g0, $hi0, $hi0
				270
				271	addcc $nlo, $hi1, $lo1
				272	addxc $nj, %g0, $hi1 ! nhi=nj
				273	addcc $lo1, $lo0, $lo1 ! np[j]m1+ap[j]bp[i]+tp[j]
				274	addxc %g0, $hi1, $hi1
				275	stx $lo1, [$tp] ! tp[j-1]
				276
				277	subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
				278	addxccc $hi1, $hi0, $hi1
				279	addxc %g0, %g0, $ovf
				280	stx $hi1, [$tp+8]
				281	add $tp, 16, $tp
				282
				283	brnz,pt $i, .Louter
				284	sub $i, 8, $i
				285
				286	sub $anp, $num, $anp ! rewind
				287	sub $tp, $num, $tp
				288	sub $anp, $num, $anp
				289	ba .Lsub
				290	subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
				291
				292	.align 16
				293	.Lsub:
				294	ldx [$tp], $tj
				295	add $tp, 8, $tp
				296	ldx [$anp+8], $nj
				297	add $anp, 16, $anp
				298	subccc $tj, $nj, $t2 ! tp[j]-np[j]
				299	srlx $tj, 32, $tj
				300	srlx $nj, 32, $nj
				301	subccc $tj, $nj, $t3
				302	add $rp, 8, $rp
				303	st $t2, [$rp-4] ! reverse order
				304	st $t3, [$rp-8]
				305	brnz,pt $cnt, .Lsub
				306	sub $cnt, 8, $cnt
				307
				308	sub $anp, $num, $anp ! rewind
				309	sub $tp, $num, $tp
				310	sub $anp, $num, $anp
				311	sub $rp, $num, $rp
				312
				313	subccc $ovf, %g0, $ovf ! handle upmost overflow bit
				314	ba .Lcopy
				315	sub $num, 8, $cnt
				316
				317	.align 16
				318	.Lcopy: ! conditional copy
				319	ld [$tp+0], $t0
				320	ld [$tp+4], $t1
				321	ld [$rp+0], $t2
				322	ld [$rp+4], $t3
				323	stx %g0, [$tp] ! zap
				324	add $tp, 8, $tp
				325	stx %g0, [$anp] ! zap
				326	stx %g0, [$anp+8]
				327	add $anp, 16, $anp
				328	movcs %icc, $t0, $t2
				329	movcs %icc, $t1, $t3
				330	st $t3, [$rp+0] ! flip order
				331	st $t2, [$rp+4]
				332	add $rp, 8, $rp
				333	brnz $cnt, .Lcopy
				334	sub $cnt, 8, $cnt
				335
				336	mov 1, %o0
				337	ret
				338	restore
				339	.type bn_mul_mont_vis3, #function
				340	.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
				341	.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
				342	.align 4
				343	___
				344
				345	# The purpose of this subroutine is to explicitly encode VIS instructions,
				346	# so that one can compile the module without having to specify VIS
				347	# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
				348	# The idea is to keep open the option of producing a "universal" binary and
				349	# let the programmer detect at run time whether the current CPU is VIS capable.
				# Translate one three-operand VIS3 instruction into a raw ".word"
				# directive so that the output assembles without VIS3 support.
				#
				# Arguments: ($mnemonic, $rs1, $rs2, $rd) -- the instruction name and
				#   its three register operands, each written as "%g0" .. "%i7".
				# Returns:   ".word\t0x<encoding> !<instruction text>" when the mnemonic
				#   is addxc, addxccc or umulxhi and all operands are integer
				#   registers; otherwise the text "mnemonic\trs1,rs2,rd" unchanged,
				#   leaving the instruction to the assembler.
				sub unvis3 {
				    my ($mnemonic,$rs1,$rs2,$rd)=@_;

				    # Register number = base of its bank + index within the bank.
				    my %bank_base = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
				    my %visopf = ( "addxc"   => 0x011,
				                   "addxccc" => 0x013,
				                   "umulxhi" => 0x016 );

				    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

				    my $opf = $visopf{$mnemonic};
				    return $ref unless $opf;

				    my @regno;
				    foreach my $operand ($rs1,$rs2,$rd) {
				        # Only an exact %[goli][0-7] operand can be encoded; digits 8
				        # and 9 would otherwise spill into the next register bank.
				        return $ref unless $operand =~ /\A%([goli])([0-7])\z/;
				        push @regno, $bank_base{$1}+$2;
				    }
				    my ($src1,$src2,$dest) = @regno;

				    # Base word 0x81b00000 with rd shifted to bit 25, rs1 to bit 14,
				    # opf to bit 5 and rs2 in the low bits.
				    return sprintf ".word\t0x%08x !%s",
				                   0x81b00000|$dest<<25|$src1<<14|$opf<<5|$src2,
				                   $ref;
				}
				373
				# Post-process the accumulated template line by line and write it out.
				foreach (split("\n",$code)) {
				    # Replace each `...` span with the result of evaluating it as Perl.
				    s/\`([^\`]*)\`/eval $1/ge;

				    # Hand-encode VIS3 instructions whose three operands are plain
				    # integer registers. The alternation needs an unescaped "|", and
				    # ",\s*" tolerates any amount of whitespace after each comma.
				    s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
				        unvis3($1,$2,$3,$4)
				     /ge;

				    print $_,"\n";
				}

				# Buffered write errors only surface at close, so its result is checked.
				close STDOUT or die "error closing STDOUT: $!";