#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, performance in parallelizable
# modes seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
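
# The script takes the perlasm "flavour" and the output file as its two
# arguments and pipes the generated code through arm-xlate.pl, which
# resolves the mixed 32-/64-bit syntax for the requested target. An
# illustrative invocation (flavour and file names here are assumptions,
# the actual set is whatever arm-xlate.pl and the build system use):
#
#	perl aesv8-armx.pl linux64 aesv8-armx.S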

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
# The goal is to maintain both 32- and 64-bit code within a single
# module and transliterate common code to either flavour with regex
# voodoo.
#
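# As an illustrative example of what the post-processors at the bottom
# of this file are expected to do, a source line such as
#
#	vld1.32	{q8},[x3],#16
#
# should come out as "ld1 {v16.4s},[x3],#16" in 64-bit flavour (q8-q15
# are remapped to v16-v23, keeping the callee-saved v8-v15 untouched)
# and as "vld1.32 {q8},[r3]!" in 32-bit flavour.
#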
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
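# Note that the 32-bit selection above skips q4-q7: those alias d8-d15,
# which are callee-saved under the AAPCS, so avoiding them spares the
# key-setup routine a save/restore sequence.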


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
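// The three vectors above are, in order: the initial round constant
// (0x01 in every lane), the vtbl index pattern that rotates the last
// key word and splats it across the vector, and 0x1b, which continues
// the rcon sequence (0x1b, 0x36) once left shifts overflow past 0x80.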

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
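	// Validate the arguments: return -1 for a NULL input or output
	// pointer and -2 for a key length other than 128, 192 or 256
	// bits; 0 is returned on success (see .Ldone below).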
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
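	// Each iteration produces one round key: vtbl rotates the last
	// key word and splats it to all four lanes, aese with an all-zero
	// round key then applies SubBytes (ShiftRows is a no-op when all
	// four words are equal), and the vext/veor chain implements the
	// word-by-word XOR recurrence of the AES key schedule.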
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

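	// Reverse the schedule in place: swap round keys end-for-end and
	// apply InvMixColumns (aesimc) to every round key except the
	// first and the last, yielding the equivalent decryption schedule.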
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

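# Emits a one-block ${prefix}_encrypt or ${prefix}_decrypt routine. The
# loop consumes two round keys per iteration (hence the initial
# "rounds-2"), and the epilogue performs the final round without a
# MixColumns step, as the AES specification requires.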
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
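	// rndzero_n_last is rndkey[0]^rndkey[last]; XOR-ing it into the
	// next plaintext block lets the loop resume from the state just
	// before the final AddRoundKey, folding CBC chaining and both
	// AddRoundKey steps into a single veor.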
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

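	// CBC decryption is parallelizable, so three blocks are kept in
	// flight to hide the aesd/aesimc latency on in-order cores.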
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
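	// The last word of the IV holds the 32-bit big-endian counter.
	// It is maintained in a general-purpose register and re-inserted
	// into lane 3 of the counter block for each batch of blocks.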
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	add	$tctr0,$ctr,#1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	rev	$tctr0,$tctr0
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
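
    # unaes is kept for reference only: the aes* substitution below is
    # commented out, presumably because ".arch armv8-a+crypto" (emitted
    # above for 64-bit flavours) lets the assembler accept the crypto
    # mnemonics natively.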

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # emit raw little-endian bytes, since ARMv7 instructions are
	    # always encoded little-endian; the correct solution would be
	    # the .inst directive, but older assemblers don't implement it:-(
| 966 | sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", |
| 967 | $word&0xff,($word>>8)&0xff, |
| 968 | ($word>>16)&0xff,($word>>24)&0xff, |
| 969 | $mnemonic,$arg; |
| 970 | } |
| 971 | }; |
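
    # Illustrative expected output (our example, not from the source):
    # for "aese q0,q1" the base opcode 0xf3b00300 is or-ed with the
    # register fields, giving ".byte 0x02,0x03,0xb0,0xf3 @ aese q0,q1".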

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
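
    # ARMv7 vtbl only writes d registers, so a q-sized vtbl is split in
    # two; e.g. "vtbl.8 q0,{q1},q2" should come out as "vtbl.8 d0,{q1},d4"
    # followed by "vtbl.8 d1,{q1},d5" (example ours).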

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o		or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";