#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
# Pick up the target "flavour" and the output file name from the command
# line; the two may come in either order, the output file being recognized
# by its file-name-like shape (something.something).
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the shared ARM perlasm translator next to this script or in
    # the canonical perlasm directory.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything we print through the translator.  Check the open:
    # an unnoticed failure here would silently produce no output at all.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    open STDOUT,">$output" or die "can't open $output: $!";
}
| 71 | |
# GPR allocation for the integer-only path.  Names mirror the textbook
# operand-scanning Montgomery multiplication variables.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";	# ap argument, stays in r1 throughout
$bp="r2"; $bi="r2"; $rp="r2";	# r2 is reused: bp, then b[i], finally rp
$np="r3";	# np argument, stays in r3 throughout
$tp="r4";	# running pointer into temporary vector tp[]
$aj="r5";	# current ap[j] word
$nj="r6";	# current np[j] word
$tj="r7";	# current tp[j] word
$n0="r8";	# Montgomery factor n0 (first &n0, then its value)
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT; low half of a-product
$ahi="r11";	# fp; high half of a-product
$nlo="r12";	# ip; low half of n-product
###########	# r13 is stack pointer
$nhi="r14";	# lr; high half of n-product
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Layout: tp[num-1] at +0, tp[num] at +4, the ten pushed registers
# r4-r12,lr at +8..+44, then the earlier-pushed {r0,r2} and the original
# stacked arguments at the offsets below.
$_rp="$num,#12*4";	# saved r0 (rp)
# ap permanently resides in r1
$_bp="$num,#13*4";	# saved r2 (bp)
# np permanently resides in r3
$_n0="$num,#14*4";	# stacked &n0; later overwritten with *n0 itself
$_num="$num,#15*4";	$_bpend=$_num;	# stacked num; reused to hold &bp[num]
| 96 | |
# Integer-only bn_mul_mont(rp,ap,bp,np,n0p,num): Montgomery product of
# ap[] and bp[] modulo np[].  Returns 0 (doing nothing) for num<2;
# otherwise allocates num+1 temporary words on the stack, runs the
# operand-scanning multiply/reduce loops (.L1st, then .Louter/.Linner
# per b-word), performs the final conditional subtraction and masked
# copy-back (.Lsub/.Lcopy, which also zaps tp[]), and returns 1.
# When built with __ARM_MAX_ARCH__>=7 and num is divisible by 8,
# OPENSSL_armcap_P is consulted at run time and control is diverted
# to bn_mul8x_mont_neon.
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
# NEON code path, bn_mul8x_mont_neon.  Entered from bn_mul_mont when
# num is divisible by 8 and ARMV7_NEON is set in OPENSSL_armcap_P.
# b[] is consumed eight words per outer iteration; intermediate sums
# are carried in 64-bit lanes of q6-q13 and resolved in .LNEON_tail.
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));		# current a[] words
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));		# current n[] words
my ($Z,$Temp)=("q4","q5");			# zero / carry-propagation scratch
my @ACC=map("q$_",(6..13));			# eight accumulators, rotated with push/shift
my ($Bi,$Ni,$M0)=map("d$_",(28..31));		# zipped b[i], m value, n0
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	veor	$zero,$zero,$zero
	sub	$toutptr,sp,$num,lsl#4
	vld1.32	{$A0-$A3}, [$aptr]!		@ can't specify :32 :-(
	and	$toutptr,$toutptr,#-64
	vld1.32	{${M0}[0]}, [$n0,:32]
	mov	sp,$toutptr			@ alloca
	vzip.16	$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor	$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub	$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov	$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov	@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov	@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov	@ACC[2],@ACC[3]
	vmov	@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov	@ACC[4],@ACC[5]
	vmov	@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov	@ACC[6],@ACC[7]
	veor	@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32	{${Bi}[0]}, [$bptr,:32]!
	veor	$zero,$zero,$zero
	vzip.16	$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor	$zero,$zero,$zero
	subs	$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov	$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov	@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov	@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov	@ACC[2],@ACC[3]
	vmov	@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov	@ACC[4],@ACC[5]
	vmov	@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov	@ACC[6],@ACC[7]
	veor	@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov	$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov	$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add	$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16	@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

.align	4
.LNEON_8n:
	veor	@ACC[0],@ACC[0],@ACC[0]
	sub	$toutptr,sp,#128
	veor	@ACC[1],@ACC[1],@ACC[1]
	sub	$toutptr,$toutptr,$num,lsl#4
	veor	@ACC[2],@ACC[2],@ACC[2]
	and	$toutptr,$toutptr,#-64
	veor	@ACC[3],@ACC[3],@ACC[3]
	mov	sp,$toutptr			@ alloca
	veor	@ACC[4],@ACC[4],@ACC[4]
	add	$toutptr,$toutptr,#256
	veor	@ACC[5],@ACC[5],@ACC[5]
	sub	$inner,$num,#8
	veor	@ACC[6],@ACC[6],@ACC[6]
	veor	@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64	{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs	$inner,$inner,#8
	vst1.64	{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64	{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64	{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne	.LNEON_8n_init

	add	$tinptr,sp,#256
	vld1.32	{$A0-$A3},[$aptr]!
	add	$bnptr,sp,#8
	vld1.32	{${M0}[0]},[$n0,:32]
	mov	$outer,$num
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32	{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor	$zero,$zero,$zero
	vzip.16	$Bi,$zero
	add	$toutptr,sp,#128
	vld1.32	{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	veor	$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32	{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# First outer pass, b[8*i+1]..b[8*i+7]: interleave the reduction for the
# previous b word with the multiplication by the next one, putting aside
# the "smashed" (zipped) b[] and m[] values for the inner loop to replay.
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor	$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16	$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32	{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64	{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	veor	$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32	{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32	{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32	{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32	{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8		@ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64	{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32	{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32	{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it	ne
	addne	$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
# Inner loop body, replaying the put-aside b[] and m[] values against
# successive 8-word chunks of a[] and n[].
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32	{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64	{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64	{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32	{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it	ne
	addne	$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it	eq
	subeq	$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32	{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32	{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add	$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64	{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add	$tinptr,sp,#128
	vst1.64	{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor	q2,q2,q2		@ $N0-$N1
	vst1.64	{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor	q3,q3,q3		@ $N2-$N3
	vst1.64	{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64	{@ACC[6]},[$toutptr,:128]

	subs	$outer,$outer,#8
	vld1.64	{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64	{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64	{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64	{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt	ne
	subne	$nptr,$nptr,$num,lsl#2	@ rewind
	bne	.LNEON_8n_outer

	add	$toutptr,sp,#128
	vst1.64	{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64	{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64	{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64	{q2-q3}, [sp,:256]!
	vzip.16	@ACC[0]#lo,@ACC[0]#hi

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64	{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64	{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64	{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16	@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
# Carry propagation through the 64-bit lanes, emitting one 32-bit result
# word per accumulator.
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32	{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16	@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64	{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs	$inner,$inner,#8
	vst1.32	{@ACC[7]#lo[0]}, [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]	@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	subs	$aptr,sp,#0			@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr			@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]			@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11			@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11			@ rewind $rptr
	mov	$nptr,$bptr			@ second 3/4th of frame
	sbcs	r10,r10,#0			@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!		@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!		@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!		@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!		@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr			@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret					@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
# Identification string plus the OPENSSL_armcap_P common symbol consumed
# by the run-time NEON capability check in bn_mul_mont.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___
| 746 | |
# Post-process the accumulated assembly line by line: evaluate any
# `...` constructs, smash NEON q<n>#lo/#hi references into the
# corresponding d-registers, and translate "ret"/"bx lr" so the result
# can also be assembled for pre-ARMv5 targets.  The three substitutions
# are mutually exclusive per line: the first one that fires wins.
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge) {
		# q-register half mapped to d-register; leave branches alone
	} elsif ($line =~ s/\bret\b/bx	lr/g) {
		# "ret" spelled out as "bx lr"
	} else {
		# make it possible to compile with -march=armv4
		$line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;
	}

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";