yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame^] | 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | # |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | # |
| 17 | # On 21264 RSA sign performance improves by 70/35/20/15 percent for |
| 18 | # 512/1024/2048/4096 bit key lengths. This is against vendor compiler |
| 19 | # instructed to '-tune host' code with in-line assembler. Other |
| 20 | # benchmarks improve by 15-20%. To anchor it to something else, the |
| 21 | # code provides approximately the same performance per GHz as AMD64. |
| 22 | # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x |
| 23 | # difference. |
| 24 | |
# The output assembly file is the last command-line argument (perlasm
# convention).  Redirect STDOUT to it so the plain `print $code' at the
# bottom of the file lands in the right place.
#
# Use three-arg open: the original two-arg `open STDOUT,">$output"' both
# ignored failure (generated assembly silently lost) and allowed mode
# characters in $output to be interpreted as part of the open mode.
$output = pop;
open STDOUT, '>', $output or die "can't open $output: $!";
| 27 | |
# Symbolic register assignments, interpolated into the assembly template
# below.  Argument registers follow the C prototype:
#
#   int bn_mul_mont(BN_ULONG *rp,        # a0: result
#                   const BN_ULONG *ap,  # a1: first operand
#                   const BN_ULONG *bp,  # a2: second operand
#                   const BN_ULONG *np,  # a3: modulus
#                   const BN_ULONG *n0,  # a4: &n0, -1/np[0] mod 2^64
#                   int num);            # a5: length in 64-bit words
($rp, $ap, $bp, $np, $n0, $num) = map { "a$_" } 0 .. 5;

# Scratch registers t0-t12: running low/high accumulators for the ap*bi
# and np*m1 products, current array words/pointers, and the tp[] cursor.
($lo0, $hi0, $lo1, $hi1, $aj, $bi, $nj, $tp,
 $alo, $ahi, $nlo, $nhi, $tj) = map { "t$_" } 0 .. 12;

# Callee-saved s3-s5 (preserved in the prologue): outer index i, inner
# index j, and m1 — the per-iteration Montgomery multiplier (lo0*n0).
($i, $j, $m1) = map { "s$_" } 3 .. 5;
| 52 | |
# Assembly template for bn_mul_mont() — Montgomery multiplication,
# rp[] = ap[]*bp[]/R mod np[], returning 1 on success and 0 when
# num < 4.  The $-variables defined above are interpolated as register
# names.  Everything between the here-doc markers is emitted verbatim,
# so all annotation lives in assembler comments that were already part
# of the output; do not add text inside the here-doc.
#
# Structure of the generated code:
#   prologue   — save ra/s3-s5/fp, carve num+2 words of tp[] off sp
#   .L1st      — first pass: tp[] = ap[]*bp[0] + np[]*m1
#   .Louter    — for i in 1..num-1: .Linner accumulates ap[]*bp[i] and
#                np[]*m1 into tp[] (dual-issue scheduled, see #L0/#U1
#                slot annotations)
#   .Lsub      — conditional subtraction: rp[] = tp[] - np[] with borrow
#   .Lcopy     — keep rp[] or copy tp[] back depending on final borrow,
#                zeroing tp[] as it goes
#   .Lexit     — restore saved registers and return
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
| 326 | |
# Emit the generated assembly to the (redirected) STDOUT.  The close is
# checked because write errors on a buffered handle only surface here.
print {*STDOUT} $code;
close STDOUT
	or die "error closing STDOUT: $!";