yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame] | 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | |
| 17 | # March 2010 |
| 18 | # |
| 19 | # The module implements "4-bit" GCM GHASH function and underlying |
| 20 | # single multiplication operation in GF(2^128). "4-bit" means that it |
| 21 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance |
| 22 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU |
| 23 | # and are expressed in cycles per processed byte, less is better: |
| 24 | # |
| 25 | # gcc 3.3.x cc 5.2 this assembler |
| 26 | # |
| 27 | # 32-bit build 81.4 43.3 12.6 (+546%/+244%) |
| 28 | # 64-bit build 20.2 21.2 12.6 (+60%/+68%) |
| 29 | # |
| 30 | # Here is data collected on UltraSPARC T1 system running Linux: |
| 31 | # |
| 32 | # gcc 4.4.1 this assembler |
| 33 | # |
| 34 | # 32-bit build 566 50 (+1000%) |
| 35 | # 64-bit build 56 50 (+12%) |
| 36 | # |
# I don't quite understand why the difference between 32-bit and 64-bit
| 38 | # compiler-generated code is so big. Compilers *were* instructed to |
| 39 | # generate code for UltraSPARC and should have used 64-bit registers |
| 40 | # for Z vector (see C code) even in 32-bit build... Oh well, it only |
| 41 | # means more impressive improvement coefficients for this assembler |
| 42 | # module;-) Loops are aggressively modulo-scheduled in respect to |
| 43 | # references to input data and Z.hi updates to achieve 12 cycles |
| 44 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 |
| 45 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. |
| 46 | # |
| 47 | # October 2012 |
| 48 | # |
| 49 | # Add VIS3 lookup-table-free implementation using polynomial |
| 50 | # multiplication xmulx[hi] and extended addition addxc[cc] |
| 51 | # instructions. 4.52/7.63x improvement on T3/T4 or in absolute |
| 52 | # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark |
| 53 | # saturates at ~15.5x single-process result on 8-core processor, |
| 54 | # or ~20.5GBps per 2.85GHz socket. |
| 55 | |
# The last command-line argument names the output file; all generated
# assembly below is printed to STDOUT, so redirect STDOUT there.
# Use checked 3-arg open: the old 2-arg ">$output" form silently
# ignored failures and is open to mode injection via the filename.
$output = pop;
open STDOUT, ">", $output or die "can't open $output: $!";
| 58 | |
# Symbolic names for stack constants and SPARC registers, interpolated
# into the assembly heredocs below.  Grouped as list assignments over
# register files, matching the style of the VIS3 section further down.
$frame = "STACK_FRAME";
$bias  = "STACK_BIAS";

# 64-bit working values live in the %o registers.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp) = map("%o$_",(0..5));

# Small values and pointers live in the %l registers.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt) = map("%l$_",(0..7));

# Input argument block arrives in the %i registers.
($Xi,$Htbl,$inp,$len) = map("%i$_",(0..3));
# Emit the 4-bit lookup-table GHASH: the shared 128-byte rem_4bit
# reduction table and the streamed gcm_ghash_4bit(Xi, Htbl, inp, len)
# subroutine.  The inner loop is modulo-scheduled (loads of the next
# nibble are interleaved with the reduction of the current one) to
# reach ~12 cycles/byte on pre-Tx UltraSPARC — do not reorder the
# instructions inside the heredoc.  Note the heredoc is a runtime
# string: register names interpolate and `...` expressions are
# evaluated by the output loop at the bottom of the file.
$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
| 238 | |
# gcm_gmult_4bit multiplies the single block already in Xi by H; it
# takes no input stream, so drop $inp/$len to catch accidental use of
# those names while emitting it.
undef $inp;
undef $len;

# Emit gcm_gmult_4bit(Xi, Htbl): same 4-bit table walk as the inner
# part of gcm_ghash_4bit above, but reading nibbles from Xi only and
# with the outer streaming loop removed.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
| 346 | |
| 347 | {{{ |
| 348 | # Straightforward 128x128-bit multiplication using Karatsuba algorithm |
| 349 | # followed by pair of 64-bit reductions [with a shortcut in first one, |
| 350 | # which allowed to break dependency between reductions and remove one |
| 351 | # multiplication from critical path]. While it might be suboptimal |
| 352 | # with regard to sheer number of multiplications, other methods [such |
| 353 | # as aggregate reduction] would require more 64-bit registers, which |
| 354 | # we don't have in 32-bit application context. |
| 355 | |
| 356 | ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); |
| 357 | |
| 358 | ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= |
| 359 | (map("%o$_",(0..5,7)),map("%g$_",(1..5))); |
| 360 | |
| 361 | ($shl,$shr)=map("%l$_",(0..7)); |
| 362 | |
| 363 | # For details regarding "twisted H" see ghash-x86.pl. |
| 364 | $code.=<<___; |
| 365 | .globl gcm_init_vis3 |
| 366 | .align 32 |
| 367 | gcm_init_vis3: |
| 368 | save %sp,-$frame,%sp |
| 369 | |
| 370 | ldx [%i1+0],$Hhi |
| 371 | ldx [%i1+8],$Hlo |
| 372 | mov 0xE1,$Xhi |
| 373 | mov 1,$Xlo |
| 374 | sllx $Xhi,57,$Xhi |
| 375 | srax $Hhi,63,$C0 ! broadcast carry |
| 376 | addcc $Hlo,$Hlo,$Hlo ! H<<=1 |
| 377 | addxc $Hhi,$Hhi,$Hhi |
| 378 | and $C0,$Xlo,$Xlo |
| 379 | and $C0,$Xhi,$Xhi |
| 380 | xor $Xlo,$Hlo,$Hlo |
| 381 | xor $Xhi,$Hhi,$Hhi |
| 382 | stx $Hlo,[%i0+8] ! save twisted H |
| 383 | stx $Hhi,[%i0+0] |
| 384 | |
| 385 | sethi %hi(0xA0406080),$V |
| 386 | sethi %hi(0x20C0E000),%l0 |
| 387 | or $V,%lo(0xA0406080),$V |
| 388 | or %l0,%lo(0x20C0E000),%l0 |
| 389 | sllx $V,32,$V |
| 390 | or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 391 | stx $V,[%i0+16] |
| 392 | |
| 393 | ret |
| 394 | restore |
| 395 | .type gcm_init_vis3,#function |
| 396 | .size gcm_init_vis3,.-gcm_init_vis3 |
| 397 | |
| 398 | .globl gcm_gmult_vis3 |
| 399 | .align 32 |
| 400 | gcm_gmult_vis3: |
| 401 | save %sp,-$frame,%sp |
| 402 | |
| 403 | ldx [$Xip+8],$Xlo ! load Xi |
| 404 | ldx [$Xip+0],$Xhi |
| 405 | ldx [$Htable+8],$Hlo ! load twisted H |
| 406 | ldx [$Htable+0],$Hhi |
| 407 | |
| 408 | mov 0xE1,%l7 |
| 409 | sllx %l7,57,$xE1 ! 57 is not a typo |
| 410 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 411 | |
| 412 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing |
| 413 | xmulx $Xlo,$Hlo,$C0 |
| 414 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing |
| 415 | xmulx $C2,$Hhl,$C1 |
| 416 | xmulxhi $Xlo,$Hlo,$Xlo |
| 417 | xmulxhi $C2,$Hhl,$C2 |
| 418 | xmulxhi $Xhi,$Hhi,$C3 |
| 419 | xmulx $Xhi,$Hhi,$Xhi |
| 420 | |
| 421 | sll $C0,3,$sqr |
| 422 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
| 423 | xor $C0,$sqr,$sqr |
| 424 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
| 425 | |
| 426 | xor $C0,$C1,$C1 ! Karatsuba post-processing |
| 427 | xor $Xlo,$C2,$C2 |
| 428 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
| 429 | xor $C3,$C2,$C2 |
| 430 | xor $Xlo,$C1,$C1 |
| 431 | xor $Xhi,$C2,$C2 |
| 432 | xor $Xhi,$C1,$C1 |
| 433 | |
| 434 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
| 435 | xor $C0,$C2,$C2 |
| 436 | xmulx $C1,$xE1,$C0 |
| 437 | xor $C1,$C3,$C3 |
| 438 | xmulxhi $C1,$xE1,$C1 |
| 439 | |
| 440 | xor $Xlo,$C2,$C2 |
| 441 | xor $C0,$C2,$C2 |
| 442 | xor $C1,$C3,$C3 |
| 443 | |
| 444 | stx $C2,[$Xip+8] ! save Xi |
| 445 | stx $C3,[$Xip+0] |
| 446 | |
| 447 | ret |
| 448 | restore |
| 449 | .type gcm_gmult_vis3,#function |
| 450 | .size gcm_gmult_vis3,.-gcm_gmult_vis3 |
| 451 | |
| 452 | .globl gcm_ghash_vis3 |
| 453 | .align 32 |
| 454 | gcm_ghash_vis3: |
| 455 | save %sp,-$frame,%sp |
| 456 | nop |
| 457 | srln $len,0,$len ! needed on v8+, "nop" on v9 |
| 458 | |
| 459 | ldx [$Xip+8],$C2 ! load Xi |
| 460 | ldx [$Xip+0],$C3 |
| 461 | ldx [$Htable+8],$Hlo ! load twisted H |
| 462 | ldx [$Htable+0],$Hhi |
| 463 | |
| 464 | mov 0xE1,%l7 |
| 465 | sllx %l7,57,$xE1 ! 57 is not a typo |
| 466 | ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 |
| 467 | |
| 468 | and $inp,7,$shl |
| 469 | andn $inp,7,$inp |
| 470 | sll $shl,3,$shl |
| 471 | prefetch [$inp+63], 20 |
| 472 | sub %g0,$shl,$shr |
| 473 | |
| 474 | xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing |
| 475 | .Loop: |
| 476 | ldx [$inp+8],$Xlo |
| 477 | brz,pt $shl,1f |
| 478 | ldx [$inp+0],$Xhi |
| 479 | |
| 480 | ldx [$inp+16],$C1 ! align data |
| 481 | srlx $Xlo,$shr,$C0 |
| 482 | sllx $Xlo,$shl,$Xlo |
| 483 | sllx $Xhi,$shl,$Xhi |
| 484 | srlx $C1,$shr,$C1 |
| 485 | or $C0,$Xhi,$Xhi |
| 486 | or $C1,$Xlo,$Xlo |
| 487 | 1: |
| 488 | add $inp,16,$inp |
| 489 | sub $len,16,$len |
| 490 | xor $C2,$Xlo,$Xlo |
| 491 | xor $C3,$Xhi,$Xhi |
| 492 | prefetch [$inp+63], 20 |
| 493 | |
| 494 | xmulx $Xlo,$Hlo,$C0 |
| 495 | xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing |
| 496 | xmulx $C2,$Hhl,$C1 |
| 497 | xmulxhi $Xlo,$Hlo,$Xlo |
| 498 | xmulxhi $C2,$Hhl,$C2 |
| 499 | xmulxhi $Xhi,$Hhi,$C3 |
| 500 | xmulx $Xhi,$Hhi,$Xhi |
| 501 | |
| 502 | sll $C0,3,$sqr |
| 503 | srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] |
| 504 | xor $C0,$sqr,$sqr |
| 505 | sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] |
| 506 | |
| 507 | xor $C0,$C1,$C1 ! Karatsuba post-processing |
| 508 | xor $Xlo,$C2,$C2 |
| 509 | xor $sqr,$Xlo,$Xlo ! real destination is $C1 |
| 510 | xor $C3,$C2,$C2 |
| 511 | xor $Xlo,$C1,$C1 |
| 512 | xor $Xhi,$C2,$C2 |
| 513 | xor $Xhi,$C1,$C1 |
| 514 | |
| 515 | xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 |
| 516 | xor $C0,$C2,$C2 |
| 517 | xmulx $C1,$xE1,$C0 |
| 518 | xor $C1,$C3,$C3 |
| 519 | xmulxhi $C1,$xE1,$C1 |
| 520 | |
| 521 | xor $Xlo,$C2,$C2 |
| 522 | xor $C0,$C2,$C2 |
| 523 | brnz,pt $len,.Loop |
| 524 | xor $C1,$C3,$C3 |
| 525 | |
| 526 | stx $C2,[$Xip+8] ! save Xi |
| 527 | stx $C3,[$Xip+0] |
| 528 | |
| 529 | ret |
| 530 | restore |
| 531 | .type gcm_ghash_vis3,#function |
| 532 | .size gcm_ghash_vis3,.-gcm_ghash_vis3 |
| 533 | ___ |
| 534 | }}} |
# Identification string embedded in the generated object file.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
| 539 | |
| 540 | |
| 541 | # Purpose of these subroutines is to explicitly encode VIS instructions, |
| 542 | # so that one can compile the module without having to specify VIS |
| 543 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. |
| 544 | # Idea is to reserve for option to produce "universal" binary and let |
| 545 | # programmer detect if current CPU is VIS capable at run-time. |
# Hand-encode a three-register VIS3 instruction (addxc/addxccc/xmulx/
# xmulxhi) as a raw ".word", so the module assembles even when the
# assembler lacks VIS3 support.  Unknown mnemonics, or operands that
# are not %g/%o/%l/%i integer registers, are returned verbatim.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd) = @_;
my %reg_base = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my %visopf   = ( "addxc"   => 0x011,
		 "addxccc" => 0x013,
		 "xmulx"   => 0x115,
		 "xmulxhi" => 0x116 );

my $asm = "$mnemonic\t$rs1,$rs2,$rd";

my $opf = $visopf{$mnemonic} or return $asm;

# Translate each operand to its global register number; bail out to
# the textual form if any operand is not a recognizable register.
my @num;
for my $operand ($rs1,$rs2,$rd) {
	return $asm unless $operand =~ /%([goli])([0-9])/;
	push @num, $reg_base{$1}+$2;
}

return sprintf ".word\t0x%08x !%s",
	       0x81b00000|$num[2]<<25|$num[0]<<14|$opf<<5|$num[1],
	       $asm;
}
| 570 | |
# Post-process the accumulated $code line by line: evaluate the
# `...` constant expressions, hand-encode VIS3 instructions via
# unvis3(), and print the result to the output file on STDOUT.
for my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	$line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	/ge;

	print $line,"\n";
}

# Flush and verify the write; buffered I/O errors surface at close.
close STDOUT or die "error closing STDOUT: $!";