Blame - ap/lib/libssl/openssl-1.1.1o/crypto/modes/asm/ghash-c64xplus.pl - R306

blob: 5826dcbdfe645c55d3cc069a9aecc25445e5021f [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	#! /usr/bin/env perl
				2	# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
				3	#
				4	# Licensed under the OpenSSL license (the "License"). You may not use
				5	# this file except in compliance with the License. You can obtain a copy
				6	# in the file LICENSE in the source distribution or at
				7	# https://www.openssl.org/source/license.html
				8
				9	#
				10	# ====================================================================
				11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
				12	# project. The module is, however, dual licensed under OpenSSL and
				13	# CRYPTOGAMS licenses depending on where you obtain it. For further
				14	# details see http://www.openssl.org/~appro/cryptogams/.
				15	# ====================================================================
				16	#
				17	# December 2011
				18	#
				19	# The module implements GCM GHASH function and underlying single
				20	# multiplication operation in GF(2^128). Even though subroutines
				21	# have _4bit suffix, they are not using any tables, but rely on
				22	# hardware Galois Field Multiply support. Streamed GHASH processes
				23	# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
				24	# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
				25	# comparing apples vs. oranges, but compiler surely could have done
				26	# better, because theoretical [though not necessarily achievable]
				27	# estimate for "4-bit" table-driven implementation is ~12 cycles.
				28
				29	while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} # discard leading arguments until one looks like "name.ext"; $output ends up false if none does
				30	$output and (open STDOUT,">$output" or die "can't open $output: $!"); # redirect only when a file was named; a failed open now aborts instead of being ignored
				31
				32	($Xip,$Htable,$inp,$len)=qw(A4 B4 A6 B6); # incoming arguments
				33
				34	($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3, # twelve names bound, in order, to A16..A27
				35	$H0x,$H1x,$H2x,$H3x)=map { "A$_" } 16..27;
				36	($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y, # twelve names bound, in order, to B16..B27
				37	$H0z,$H1z,$H2z,$H3z)=map { "B$_" } 16..27;
				38	($FF000000,$E10000)=qw(B30 B31); # registers for the upper-byte mask and the pre-shifted reduction polynomial
				39	($xip,$x0,$x1,$xib)=map { "B$_" } 6..9; # $xip is B6, the same register as $len, so it overwrites $len
				40	$xia="A9";
				41	($rem,$res)=qw(B4 B5); # $rem is B4, the same register as $Htable, so it overwrites $Htable
				42
				43	$code.=<<___; # appends: .text and symbol-alias directives, _gcm_gmult_4bit (sets B0=1 and branches to ghash_loop?), and _gcm_ghash_4bit up to the SPLOOPD header at ghash_loop?
				44	.text
				45
				46	.if .ASSEMBLER_VERSION<7000000
				47	.asg 0,__TI_EABI__
				48	.endif
				49	.if __TI_EABI__
				50	.asg gcm_gmult_1bit,_gcm_gmult_1bit
				51	.asg gcm_gmult_4bit,_gcm_gmult_4bit
				52	.asg gcm_ghash_4bit,_gcm_ghash_4bit
				53	.endif
				54
				55	.asg B3,RA
				56
				57	.if 0
				58	.global _gcm_gmult_1bit
				59	_gcm_gmult_1bit:
				60	ADDAD $Htable,2,$Htable
				61	.endif
				62	.global _gcm_gmult_4bit
				63	_gcm_gmult_4bit:
				64	.asmfunc
				65	LDDW *${Htable}[-1],$H1:$H0 ; H.lo
				66	LDDW *${Htable}[-2],$H3:$H2 ; H.hi
				67	\|\| MV $Xip,${xip} ; reassign Xi
				68	\|\| MVK 15,B1 ; SPLOOPD constant
				69
				70	MVK 0xE1,$E10000
				71	\|\| LDBU *++${xip}[15],$x1 ; Xi[15]
				72	MVK 0xFF,$FF000000
				73	\|\| LDBU *--${xip},$x0 ; Xi[14]
				74	SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
				75	SHL $FF000000,24,$FF000000 ; upper byte mask
				76	\|\| BNOP ghash_loop?
				77	\|\| MVK 1,B0 ; take a single spin
				78
				79	PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
				80	AND $H2,$FF000000,$H2u ; H2's upper byte
				81	AND $H3,$FF000000,$H3u ; H3's upper byte
				82	\|\| SHRU $H2u,8,$H2u
				83	SHRU $H3u,8,$H3u
				84	\|\| ZERO $Z1:$Z0
				85	SHRU2 $xia,8,$H01u
				86	\|\| ZERO $Z3:$Z2
				87	.endasmfunc
				88
				89	.global _gcm_ghash_4bit
				90	_gcm_ghash_4bit:
				91	.asmfunc
				92	LDDW *${Htable}[-1],$H1:$H0 ; H.lo
				93	\|\| SHRU $len,4,B0 ; reassign len
				94	LDDW *${Htable}[-2],$H3:$H2 ; H.hi
				95	\|\| MV $Xip,${xip} ; reassign Xi
				96	\|\| MVK 15,B1 ; SPLOOPD constant
				97
				98	MVK 0xE1,$E10000
				99	\|\| [B0] LDNDW *${inp}[1],$H1x:$H0x
				100	MVK 0xFF,$FF000000
				101	\|\| [B0] LDNDW *${inp}++[2],$H3x:$H2x
				102	SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
				103	\|\| LDDW *${xip}[1],$Z1:$Z0
				104	SHL $FF000000,24,$FF000000 ; upper byte mask
				105	\|\| LDDW *${xip}[0],$Z3:$Z2
				106
				107	PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
				108	AND $H2,$FF000000,$H2u ; H2's upper byte
				109	AND $H3,$FF000000,$H3u ; H3's upper byte
				110	\|\| SHRU $H2u,8,$H2u
				111	SHRU $H3u,8,$H3u
				112	SHRU2 $xia,8,$H01u
				113
				114	\|\| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
				115	\|\| [B0] XOR $H1x,$Z1,$Z1
				116	.if .LITTLE_ENDIAN
				117	[B0] XOR $H2x,$Z2,$Z2
				118	\|\| [B0] XOR $H3x,$Z3,$Z3
				119	\|\| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
				120	STDW $Z1:$Z0,*${xip}[1]
				121	\|\| [B0] SHRU $Z1,16,$x0 ; Xi[14]
				122	\|\| [B0] ZERO $Z1:$Z0
				123	.else
				124	[B0] XOR $H2x,$Z2,$Z2
				125	\|\| [B0] XOR $H3x,$Z3,$Z3
				126	\|\| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
				127	STDW $Z1:$Z0,*${xip}[1]
				128	\|\| [B0] SHRU $Z0,8,$x0 ; Xi[14]
				129	\|\| [B0] ZERO $Z1:$Z0
				130	.endif
				131	STDW $Z3:$Z2,*${xip}[0]
				132	\|\| [B0] ZERO $Z3:$Z2
				133	\|\| [B0] MV $xia,$x1
				134	[B0] ADDK 14,${xip}
				135
				136	ghash_loop?:
				137	SPLOOPD 6 ; 6*16+7
				138	\|\| MVC B1,ILC
				139	\|\| [B0] SUB B0,1,B0
				140	\|\| ZERO A0
				141	\|\| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
				142	\|\| SHL $x1,1,$xia
				143	___
				144
				145	########____________________________
				146	# 0 D2. M1 M2 \|
				147	# 1 M1 \|
				148	# 2 M1 M2 \|
				149	# 3 D1. M1 M2 \|
				150	# 4 S1. L1 \|
				151	# 5 S2 S1x L1 D2 L2 \|____________________________
				152	# 6/0 L1 S1 L2 S2x \|D2. M1 M2 \|
				153	# 7/1 L1 S1 D1x S2 M2 \| M1 \|
				154	# 8/2 S1 L1x S2 \| M1 M2 \|
				155	# 9/3 S1 L1x \| D1. M1 M2 \|
				156	# 10/4 D1x \| S1. L1 \|
				157	# 11/5 \|S2 S1x L1 D2 L2 \|____________
				158	# 12/6/0 D1x __\| L1 S1 L2 S2x \|D2. ....
				159	# 7/1 L1 S1 D1x S2 M2 \| ....
				160	# 8/2 S1 L1x S2 \| ....
				161	#####... ................\|............
				162	$code.=<<___; # appends: the SPLOOPD loop body up to SPKERNEL, the post-loop code (input pre-fetch, SWAP2/SWAP4 under .LITTLE_ENDIAN, stores to Xi, branch to ghash_loop? while B0 is non-zero or to RA otherwise), and the .const ident string
				163	XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
				164	\|\| XORMPY $H01u,$xib,$H01y
				165	\|\| [A0] LDBU *--${xip},$x0
				166	XORMPY $H1,$xia,$H1x ; 1
				167	XORMPY $H2,$xia,$H2x ; 2
				168	\|\| XORMPY $H2u,$xib,$H2y
				169	XORMPY $H3,$xia,$H3x ; 3
				170	\|\| XORMPY $H3u,$xib,$H3y
				171	\|\|[!A0] MVK.D 15,A0 ; *--${xip} counter
				172	XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
				173	\|\| [A0] SUB.S A0,1,A0
				174	XOR.L $H1x,$Z1,$Z1 ; 5
				175	\|\| AND.D $H01y,$FF000000,$H0z
				176	\|\| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
				177	\|\| SHL $x0,1,$xib
				178	\|\| SHL $x0,1,$xia
				179
				180	XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
				181	\|\| SHL $Z0,1,$rem ; ; rem=Z<<1
				182	\|\| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
				183	\|\| AND.L $H1y,$FF000000,$H1z
				184	XOR.L $H3x,$Z3,$Z3 ; 7/1
				185	\|\| SHRMB.S $Z2,$Z1,$Z1
				186	\|\| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
				187	\|\| AND.S $H2y,$FF000000,$H2z
				188	\|\| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
				189	XOR.L $H1z,$Z1,$Z1 ; 8/2
				190	\|\| SHRMB.S $Z3,$Z2,$Z2
				191	\|\| AND.S $H3y,$FF000000,$H3z
				192	XOR.L $H2z,$Z2,$Z2 ; 9/3
				193	\|\| SHRU $Z3,8,$Z3
				194	XOR.D $H3z,$Z3,$Z3 ; 10/4
				195	NOP ; 11/5
				196
				197	SPKERNEL 0,2
				198	\|\| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
				199
				200	; input pre-fetch is possible where D1 slot is available...
				201	[B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
				202	[B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
				203	NOP ; 10/-
				204	.if .LITTLE_ENDIAN
				205	SWAP2 $Z0,$Z1 ; 11/-
				206	\|\| SWAP4 $Z1,$Z0
				207	SWAP4 $Z1,$Z1 ; 12/-
				208	\|\| SWAP2 $Z0,$Z0
				209	SWAP2 $Z2,$Z3
				210	\|\| SWAP4 $Z3,$Z2
				211	\|\|[!B0] BNOP RA
				212	SWAP4 $Z3,$Z3
				213	\|\| SWAP2 $Z2,$Z2
				214	\|\| [B0] BNOP ghash_loop?
				215	[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
				216	\|\| [B0] XOR $H1x,$Z1,$Z1
				217	[B0] XOR $H2x,$Z2,$Z2
				218	\|\| [B0] XOR $H3x,$Z3,$Z3
				219	\|\| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
				220	STDW $Z1:$Z0,*${xip}[1]
				221	\|\| [B0] SHRU $Z1,16,$x0 ; Xi[14]
				222	\|\| [B0] ZERO $Z1:$Z0
				223	.else
				224	[!B0] BNOP RA ; 11/-
				225	[B0] BNOP ghash_loop? ; 12/-
				226	[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
				227	\|\| [B0] XOR $H1x,$Z1,$Z1
				228	[B0] XOR $H2x,$Z2,$Z2
				229	\|\| [B0] XOR $H3x,$Z3,$Z3
				230	\|\| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
				231	STDW $Z1:$Z0,*${xip}[1]
				232	\|\| [B0] SHRU $Z0,8,$x0 ; Xi[14]
				233	\|\| [B0] ZERO $Z1:$Z0
				234	.endif
				235	STDW $Z3:$Z2,*${xip}[0]
				236	\|\| [B0] ZERO $Z3:$Z2
				237	\|\| [B0] MV $xia,$x1
				238	[B0] ADDK 14,${xip}
				239	.endasmfunc
				240
				241	.sect .const
				242	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
				243	.align 4
				244	___
				245
				246	print STDOUT $code; # write out everything accumulated in $code
				247	close(STDOUT) || die "error closing STDOUT: $!"; # abort if the close reports an error