#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# The AArch64 register bank is used to "accommodate" 4x aggregated
# reduction, improving performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7      0.58            0.92            5.62
# Cortex-A53    0.85            1.01            8.39
# Cortex-A57    0.73            1.17            7.61
# Denver        0.51            0.65            6.02
# Mongoose      0.65            1.10            8.06
# Kryo          0.76            1.16            8.00
#
# (*) presented for reference/comparison purposes;

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
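# The script is typically driven by the build system along the lines of
# (file names here are illustrative, not taken from this file):
#
#   perl ghashv8-armx.pl linux64 ghashv8-armx.S
#   perl ghashv8-armx.pl linux32 ghashv8-armx.S
#
# where the flavour argument selects AArch64 vs AArch32 output and is
# passed on to arm-xlate.pl together with the output file name.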

$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
.code 32
#undef __thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle the reverse bitness of GHASH;
# only a few of the 16 slots of Htable[16] are used;
# data is opaque to the outside world (which allows the
# code to be optimized independently);
#
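# A note on layout, inferred from the stores below rather than stated by
# the interface: the slots of Htable that end up populated are
#
#   Htable[0]   twisted H
#   Htable[1]   Karatsuba pre-processed halves of H and H^2, packed
#   Htable[2]   H^2
#   Htable[3]   H^3                               (64-bit flavour only)
#   Htable[4]   packed halves of H^3 and H^4      (64-bit flavour only)
#   Htable[5]   H^4                               (64-bit flavour only)
#
# and the "twist" amounts, roughly, to multiplying H by x in the
# bit-reflected GF(2^128) representation:
#
#   twisted H = (H<<1) ^ (msb(H) ? 0xc2000000000000000000000000000001 : 0)
#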
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]

@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0

vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase

vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2

vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
___
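# For reference (a standard identity, not specific to this file): the
# Karatsuba pre-/post-processing above is the usual one-level split of a
# 128x128-bit carry-less multiplication. With A = Ah·x^64 + Al and
# B = Bh·x^64 + Bl over GF(2)[x],
#
#   A·B = Ah·Bh·x^128 + [(Ah+Al)·(Bh+Bl) + Ah·Bh + Al·Bl]·x^64 + Al·Bl
#
# "pre-processing" forms Ah+Al (an xor), "post-processing" folds the outer
# products back into the middle term, so only three 64x64-bit pmulls are
# needed instead of four.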
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1

vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2

vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H, $Xl,$t2 @ H^3
veor $H2,$Yl,$t3 @ H^4

vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
vext.8 $t1,$H2,$H2,#8
veor $t0,$t0,$H
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
___
}
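# H^3 and H^4 are only needed by the 4x-aggregated gcm_ghash_v8_4x path
# further down, which is generated for the 64-bit flavour only - hence the
# conditional block above.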
$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
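# In other words a single GHASH multiplication step: Xi is replaced by
# Xi·H in GF(2^128), in the byte order mandated by GCM (hence the
# vrev64/vext shuffling on little-endian below).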
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8

vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction

vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2

#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi

ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, which must be divisible by the block size;
# output: next hash value Xi;
#
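# Functionally this is equivalent to folding the blocks in one at a time,
#
#   for each 16-byte block B of inp:  Xi = (Xi + B)·H mod P
#
# (addition being xor); the code below merely aggregates two or four
# blocks per reduction, as outlined in the commentary that follows.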
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
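# The benefit of this aggregation is that only one reduction modulo P is
# performed per two processed blocks, while the number of 64-bit carry-less
# multiplications per block stays the same.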
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8

.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?

vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]

veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn

vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction

#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there were at least 32 more bytes

veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
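# If $len was not a multiple of 32, exactly one 16-byte block remains at
# this point and is folded in with a single, non-aggregated multiplication
# below.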
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi

vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction

vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi

___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) { # 4x subroutine
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

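# Same aggregation idea as the mod2x loop above, extended to four blocks:
#
#   Xi+4 = [H^4·(Xi+Ii) + H^3·Ii+1 + H^2·Ii+2 + H·Ii+3] mod P
#
# i.e. four blocks are multiplied by the matching powers of H and summed
# before a single reduction.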
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant

vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8

vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3

vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2

veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2

vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1

veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1

subs $len,$len,#128
b.lo .Ltail4x

b .Loop4x

.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif

vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0

veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8

vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3

vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2

veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1

veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1

subs $len,$len,#64
b.hs .Loop4x

.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8

vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0

veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym

adds $len,$len,#64
b.eq .Ldone4x

cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif

vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2

vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8

vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8

veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1

vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0

veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x

.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif

vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8

vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1

veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8

vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1

vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0

veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x

.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif

vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8

veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8

vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0

.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2

vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2

vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi

ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___

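# The code above is written once in (mostly) ARMv7 NEON syntax; the two
# post-processing loops below rewrite it into native AArch64 or AArch32
# assembly depending on $flavour.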
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;

$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
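# For example (input chosen here for illustration), unvmov("q0#hi,q1#lo")
# yields "ins v0.d[1],v1.d[0]"; note that q8-q15 are renumbered to v16-v23,
# presumably to keep the generated code clear of the callee-saved v8-v15.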
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;

s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary

# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;

$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
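# For example (input chosen here for illustration), unvdup32("q1,q2[3]")
# yields "vdup.32 q1,d5[1]", re-expressing the q-register lane in terms of
# the corresponding d register.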
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;

if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# bytes are emitted in little-endian order, since ARMv7 instructions
# are always encoded little-endian; the correct solution would be the
# .inst directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}

foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary

# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;

s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;

print $_,"\n";
}
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush