#! /usr/bin/env perl
| 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | # |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | # |
| 17 | # This module implements Poly1305 hash for PowerPC FPU. |
| 18 | # |
| 19 | # June 2015 |
| 20 | # |
| 21 | # Numbers are cycles per processed byte with poly1305_blocks alone, |
| 22 | # and improvement coefficients relative to gcc-generated code. |
| 23 | # |
| 24 | # Freescale e300 9.78/+30% |
| 25 | # PPC74x0 6.92/+50% |
| 26 | # PPC970 6.03/+80% |
| 27 | # POWER7 3.50/+30% |
| 28 | # POWER8 3.75/+10% |
| 29 | |
# The first command-line argument names the target "flavour"; it must
# mention 64 or 32 and may end in "le" for little-endian targets.
$flavour = shift;

# The word size fixes the link-register save slot and the mnemonics used
# for pointer-sized compare, store-with-update, load and store.  A flavour
# that mentions 64 wins over one that mentions 32.
if ($flavour =~ /64/) {
    ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (8, "cmpld", "stdu", "ld", "std");
    $LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
    ($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (4, "cmplw", "stwu", "lwz", "stw");
    $LRSAVE = $SIZE_T;
}
else {
    die "nonsense $flavour";
}

# 4 on little-endian flavours, 0 otherwise.  The value is XOR-ed into byte
# offsets elsewhere in this file to pick one 32-bit half of a 64-bit slot.
$LITTLE_ENDIAN = 0;
$LITTLE_ENDIAN = 4 if $flavour =~ /le$/;

# Indexed 32-bit load used for key and input words: plain lwzx on
# little-endian flavours, byte-reversing lwbrx otherwise.
$LWXLE = "lwbrx";
$LWXLE = "lwzx" if $LITTLE_ENDIAN;
| 51 | |
# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.  $dir is the directory part of $0 including
# the trailing separator; it falls back to "" when $0 has no directory
# component instead of reusing whatever $1 held after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through the translator.  The next
# command-line argument is forwarded verbatim (empty if absent).
# Low-precedence "or" is required here: with "||" the die was attached to
# the command string, which is always true, so a failed open() went
# unnoticed.
my $output = shift;
$output = "" unless defined $output;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
| 58 | |
# Stack frame used by poly1305_blocks_fpu: $LOCALS bytes at the bottom,
# then six 8-byte scratch slots, then room for eighteen 8-byte FP register
# saves (f14-f31).
$LOCALS = 6 * $SIZE_T;
$FRAME  = $LOCALS + 6*8 + 18*8;

my $sp = "r1";

# Argument registers.
my ($ctx, $inp, $len, $padbit) = qw(r3 r4 r5 r6);
# Integer scratch registers; note that $i3 shares r6 with $padbit.
my ($in0, $in1, $in2, $in3, $i1, $i2, $i3) = qw(r7 r8 r9 r10 r11 r12 r6);

# All 32 floating-point registers, handed out in order f0..f31.
my ($h0lo, $h0hi, $h1lo, $h1hi, $h2lo, $h2hi, $h3lo, $h3hi,
    $two0, $two32, $two64, $two96, $two130, $five_two130,
    $r0lo, $r0hi, $r1lo, $r1hi, $r2lo, $r2hi,
    $s2lo, $s2hi, $s3lo, $s3hi,
    $c0lo, $c0hi, $c1lo, $c1hi, $c2lo, $c2hi, $c3lo, $c3hi)
    = map("f" . $_, 0 .. 31);

# Borrowings: these names alias the carry registers declared above.
my ($r3lo, $r3hi, $s1lo, $s1hi) = ($c0lo, $c0hi, $c1lo, $c1hi);
my ($x0, $x1, $x2, $x3)         = ($c2lo, $c2hi, $c3lo, $c3hi);
my ($y0, $y1, $y2, $y3)         = ($c3lo, $c3hi, $c1lo, $c1hi);
| 76 | |
# poly1305_init_fpu: $ctx (r3) = context, $inp (r4) = key pointer.
#
# Calls LPICmeup to get the address of the constant table in $len, then
# stores the four bias constants into context slots 0..3 as the initial
# ("biased 0") hash value.  If the key pointer is zero it skips straight to
# Lno_key.  Otherwise it:
#   - loads four 32-bit key words with $LWXLE and clamps them
#     (&=0x0fffffff for the first, &=0x0ffffffc for the rest);
#   - drops them into the low words of bias-constant "templates" in
#     slots 4..7 so they can be reloaded as doubles;
#   - switches FPSCR to the value from table entry 13 and puts the original
#     back before returning;
#   - copies six constants to slots 18..23;
#   - de-biases r0..r3, forms s1..s3 by multiplying r1..r3 with
#     $five_two130, and splits each value into hi and lo halves.
# The return value in r3 is always 0.
#
# Resulting context layout, in 8-byte slots:
#    0..3   biased hash value
#    4..11  r0lo,r0hi,r1lo,r1hi,r2lo,r2hi,r3lo,r3hi
#   12..17  s1lo,s1hi,s2lo,s2hi,s3lo,s3hi
#   18..23  constants (table entries 0..5)
# Slots 12..15 are also used as temporaries while the split is in progress.
$code.=<<___;
.machine "any"
.text

.globl .poly1305_init_fpu
.align 6
.poly1305_init_fpu:
$STU $sp,-$LOCALS($sp) # minimal frame
mflr $padbit
$PUSH $padbit,`$LOCALS+$LRSAVE`($sp)

bl LPICmeup

xor r0,r0,r0
mtlr $padbit # restore lr

lfd $two0,8*0($len) # load constants
lfd $two32,8*1($len)
lfd $two64,8*2($len)
lfd $two96,8*3($len)
lfd $two130,8*4($len)
lfd $five_two130,8*5($len)

stfd $two0,8*0($ctx) # initial hash value, biased 0
stfd $two32,8*1($ctx)
stfd $two64,8*2($ctx)
stfd $two96,8*3($ctx)

$UCMP $inp,r0
beq- Lno_key

lfd $h3lo,8*13($len) # new fpscr
mffs $h3hi # old fpscr

stfd $two0,8*4($ctx) # key "template"
stfd $two32,8*5($ctx)
stfd $two64,8*6($ctx)
stfd $two96,8*7($ctx)

li $in1,4
li $in2,8
li $in3,12
$LWXLE $in0,0,$inp # load key
$LWXLE $in1,$in1,$inp
$LWXLE $in2,$in2,$inp
$LWXLE $in3,$in3,$inp

lis $i1,0xf000 # 0xf0000000
ori $i2,$i1,3 # 0xf0000003
andc $in0,$in0,$i1 # &=0x0fffffff
andc $in1,$in1,$i2 # &=0x0ffffffc
andc $in2,$in2,$i2
andc $in3,$in3,$i2

stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)

mtfsf 255,$h3lo # fpscr
stfd $two0,8*18($ctx) # copy constants to context
stfd $two32,8*19($ctx)
stfd $two64,8*20($ctx)
stfd $two96,8*21($ctx)
stfd $two130,8*22($ctx)
stfd $five_two130,8*23($ctx)

lfd $h0lo,8*4($ctx) # load [biased] key
lfd $h1lo,8*5($ctx)
lfd $h2lo,8*6($ctx)
lfd $h3lo,8*7($ctx)

fsub $h0lo,$h0lo,$two0 # r0
fsub $h1lo,$h1lo,$two32 # r1
fsub $h2lo,$h2lo,$two64 # r2
fsub $h3lo,$h3lo,$two96 # r3

lfd $two0,8*6($len) # more constants
lfd $two32,8*7($len)
lfd $two64,8*8($len)
lfd $two96,8*9($len)

fmul $h1hi,$h1lo,$five_two130 # s1
fmul $h2hi,$h2lo,$five_two130 # s2
stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
fmul $h3hi,$h3lo,$five_two130 # s3

fadd $h0hi,$h0lo,$two0
stfd $h1hi,8*12($ctx) # put aside for now
fadd $h1hi,$h1lo,$two32
stfd $h2hi,8*13($ctx)
fadd $h2hi,$h2lo,$two64
stfd $h3hi,8*14($ctx)
fadd $h3hi,$h3lo,$two96

fsub $h0hi,$h0hi,$two0
fsub $h1hi,$h1hi,$two32
fsub $h2hi,$h2hi,$two64
fsub $h3hi,$h3hi,$two96

lfd $two0,8*10($len) # more constants
lfd $two32,8*11($len)
lfd $two64,8*12($len)

fsub $h0lo,$h0lo,$h0hi
fsub $h1lo,$h1lo,$h1hi
fsub $h2lo,$h2lo,$h2hi
fsub $h3lo,$h3lo,$h3hi

stfd $h0hi,8*5($ctx) # r0hi
stfd $h1hi,8*7($ctx) # r1hi
stfd $h2hi,8*9($ctx) # r2hi
stfd $h3hi,8*11($ctx) # r3hi

stfd $h0lo,8*4($ctx) # r0lo
stfd $h1lo,8*6($ctx) # r1lo
stfd $h2lo,8*8($ctx) # r2lo
stfd $h3lo,8*10($ctx) # r3lo

lfd $h1lo,8*12($ctx) # s1
lfd $h2lo,8*13($ctx) # s2
lfd $h3lo,8*14($ctx) # s3
lfd $h0lo,8*15($ctx) # pull original fpscr

fadd $h1hi,$h1lo,$two0
fadd $h2hi,$h2lo,$two32
fadd $h3hi,$h3lo,$two64

fsub $h1hi,$h1hi,$two0
fsub $h2hi,$h2hi,$two32
fsub $h3hi,$h3hi,$two64

fsub $h1lo,$h1lo,$h1hi
fsub $h2lo,$h2lo,$h2hi
fsub $h3lo,$h3lo,$h3hi

stfd $h1hi,8*13($ctx) # s1hi
stfd $h2hi,8*15($ctx) # s2hi
stfd $h3hi,8*17($ctx) # s3hi

stfd $h1lo,8*12($ctx) # s1lo
stfd $h2lo,8*14($ctx) # s2lo
stfd $h3lo,8*16($ctx) # s3lo

mtfsf 255,$h0lo # restore fpscr
Lno_key:
xor r3,r3,r3
addi $sp,$sp,$LOCALS
blr
.long 0
.byte 0,12,4,1,0x80,0,2,0
.size .poly1305_init_fpu,.-.poly1305_init_fpu
| 229 | |
# poly1305_blocks_fpu: r3 = context, r4 = input, r5 = length in bytes,
# r6 = pad bit.
#
# Processes length/16 whole 16-byte blocks and writes the updated, biased
# hash value back to context slots 0..3.  It returns at once through Labort
# when length/16 is zero.  The pad bit is OR-ed into the high word of the
# fourth input limb template.
# f14-f31 and the link register are saved in the stack frame.  The caller's
# FPSCR is parked in a local slot and reinstated before returning.
# Input words are fetched one block ahead of the arithmetic.  The
# "conditional rewind" steps the input pointer back by 16 once the negated
# block counter reaches zero, presumably so that the look-ahead load never
# reads past the end of the input (TODO confirm).
.globl .poly1305_blocks_fpu
.align 4
.poly1305_blocks_fpu:
srwi. $len,$len,4
beq- Labort

$STU $sp,-$FRAME($sp)
mflr r0
stfd f14,`$FRAME-8*18`($sp)
stfd f15,`$FRAME-8*17`($sp)
stfd f16,`$FRAME-8*16`($sp)
stfd f17,`$FRAME-8*15`($sp)
stfd f18,`$FRAME-8*14`($sp)
stfd f19,`$FRAME-8*13`($sp)
stfd f20,`$FRAME-8*12`($sp)
stfd f21,`$FRAME-8*11`($sp)
stfd f22,`$FRAME-8*10`($sp)
stfd f23,`$FRAME-8*9`($sp)
stfd f24,`$FRAME-8*8`($sp)
stfd f25,`$FRAME-8*7`($sp)
stfd f26,`$FRAME-8*6`($sp)
stfd f27,`$FRAME-8*5`($sp)
stfd f28,`$FRAME-8*4`($sp)
stfd f29,`$FRAME-8*3`($sp)
stfd f30,`$FRAME-8*2`($sp)
stfd f31,`$FRAME-8*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)

xor r0,r0,r0
li $in3,1
mtctr $len
neg $len,$len
stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)

lfd $two0,8*18($ctx) # load constants
lfd $two32,8*19($ctx)
lfd $two64,8*20($ctx)
lfd $two96,8*21($ctx)
lfd $two130,8*22($ctx)
lfd $five_two130,8*23($ctx)

lfd $h0lo,8*0($ctx) # load [biased] hash value
lfd $h1lo,8*1($ctx)
lfd $h2lo,8*2($ctx)
lfd $h3lo,8*3($ctx)

stfd $two0,`$LOCALS+8*0`($sp) # input "template"
oris $in3,$padbit,`(1023+52+96)<<4`
stfd $two32,`$LOCALS+8*1`($sp)
stfd $two64,`$LOCALS+8*2`($sp)
stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)

li $i1,4
li $i2,8
li $i3,12
$LWXLE $in0,0,$inp # load input
$LWXLE $in1,$i1,$inp
$LWXLE $in2,$i2,$inp
$LWXLE $in3,$i3,$inp
addi $inp,$inp,16

stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

mffs $x0 # original fpscr
lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
lfd $r0lo,8*4($ctx) # load key
lfd $r0hi,8*5($ctx)
lfd $r1lo,8*6($ctx)
lfd $r1hi,8*7($ctx)
lfd $r2lo,8*8($ctx)
lfd $r2hi,8*9($ctx)
lfd $r3lo,8*10($ctx)
lfd $r3hi,8*11($ctx)
lfd $s1lo,8*12($ctx)
lfd $s1hi,8*13($ctx)
lfd $s2lo,8*14($ctx)
lfd $s2hi,8*15($ctx)
lfd $s3lo,8*16($ctx)
lfd $s3hi,8*17($ctx)

stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
mtfsf 255,$x1

addic $len,$len,1
addze r0,r0
slwi. r0,r0,4
sub $inp,$inp,r0 # conditional rewind

lfd $x0,`$LOCALS+8*0`($sp)
lfd $x1,`$LOCALS+8*1`($sp)
lfd $x2,`$LOCALS+8*2`($sp)
lfd $x3,`$LOCALS+8*3`($sp)

fsub $h0lo,$h0lo,$two0 # de-bias hash value
$LWXLE $in0,0,$inp # modulo-scheduled input load
fsub $h1lo,$h1lo,$two32
$LWXLE $in1,$i1,$inp
fsub $h2lo,$h2lo,$two64
$LWXLE $in2,$i2,$inp
fsub $h3lo,$h3lo,$two96
$LWXLE $in3,$i3,$inp

fsub $x0,$x0,$two0 # de-bias input
addi $inp,$inp,16
fsub $x1,$x1,$two32
fsub $x2,$x2,$two64
fsub $x3,$x3,$two96

fadd $x0,$x0,$h0lo # accumulate input
stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
fadd $x1,$x1,$h1lo
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
fadd $x2,$x2,$h2lo
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
fadd $x3,$x3,$h3lo
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

b Lentry

.align 4
Loop:
fsub $y0,$y0,$two0 # de-bias input
addic $len,$len,1
fsub $y1,$y1,$two32
addze r0,r0
fsub $y2,$y2,$two64
slwi. r0,r0,4
fsub $y3,$y3,$two96
sub $inp,$inp,r0 # conditional rewind

fadd $h0lo,$h0lo,$y0 # accumulate input
fadd $h0hi,$h0hi,$y1
fadd $h2lo,$h2lo,$y2
fadd $h2hi,$h2hi,$y3

######################################### base 2^48 -> base 2^32
fadd $c1lo,$h1lo,$two64
$LWXLE $in0,0,$inp # modulo-scheduled input load
fadd $c1hi,$h1hi,$two64
$LWXLE $in1,$i1,$inp
fadd $c3lo,$h3lo,$two130
$LWXLE $in2,$i2,$inp
fadd $c3hi,$h3hi,$two130
$LWXLE $in3,$i3,$inp
fadd $c0lo,$h0lo,$two32
addi $inp,$inp,16
fadd $c0hi,$h0hi,$two32
fadd $c2lo,$h2lo,$two96
fadd $c2hi,$h2hi,$two96

fsub $c1lo,$c1lo,$two64
stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
fsub $c1hi,$c1hi,$two64
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
fsub $c3lo,$c3lo,$two130
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
fsub $c3hi,$c3hi,$two130
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
fsub $c0lo,$c0lo,$two32
fsub $c0hi,$c0hi,$two32
fsub $c2lo,$c2lo,$two96
fsub $c2hi,$c2hi,$two96

fsub $h1lo,$h1lo,$c1lo
fsub $h1hi,$h1hi,$c1hi
fsub $h3lo,$h3lo,$c3lo
fsub $h3hi,$h3hi,$c3hi
fsub $h2lo,$h2lo,$c2lo
fsub $h2hi,$h2hi,$c2hi
fsub $h0lo,$h0lo,$c0lo
fsub $h0hi,$h0hi,$c0hi

fadd $h1lo,$h1lo,$c0lo
fadd $h1hi,$h1hi,$c0hi
fadd $h3lo,$h3lo,$c2lo
fadd $h3hi,$h3hi,$c2hi
fadd $h2lo,$h2lo,$c1lo
fadd $h2hi,$h2hi,$c1hi
fmadd $h0lo,$c3lo,$five_two130,$h0lo
fmadd $h0hi,$c3hi,$five_two130,$h0hi

fadd $x1,$h1lo,$h1hi
lfd $s1lo,8*12($ctx) # reload constants
fadd $x3,$h3lo,$h3hi
lfd $s1hi,8*13($ctx)
fadd $x2,$h2lo,$h2hi
lfd $r3lo,8*10($ctx)
fadd $x0,$h0lo,$h0hi
lfd $r3hi,8*11($ctx)
Lentry:
fmul $h0lo,$s3lo,$x1
fmul $h0hi,$s3hi,$x1
fmul $h2lo,$r1lo,$x1
fmul $h2hi,$r1hi,$x1
fmul $h1lo,$r0lo,$x1
fmul $h1hi,$r0hi,$x1
fmul $h3lo,$r2lo,$x1
fmul $h3hi,$r2hi,$x1

fmadd $h0lo,$s1lo,$x3,$h0lo
fmadd $h0hi,$s1hi,$x3,$h0hi
fmadd $h2lo,$s3lo,$x3,$h2lo
fmadd $h2hi,$s3hi,$x3,$h2hi
fmadd $h1lo,$s2lo,$x3,$h1lo
fmadd $h1hi,$s2hi,$x3,$h1hi
fmadd $h3lo,$r0lo,$x3,$h3lo
fmadd $h3hi,$r0hi,$x3,$h3hi

fmadd $h0lo,$s2lo,$x2,$h0lo
fmadd $h0hi,$s2hi,$x2,$h0hi
fmadd $h2lo,$r0lo,$x2,$h2lo
fmadd $h2hi,$r0hi,$x2,$h2hi
fmadd $h1lo,$s3lo,$x2,$h1lo
fmadd $h1hi,$s3hi,$x2,$h1hi
fmadd $h3lo,$r1lo,$x2,$h3lo
fmadd $h3hi,$r1hi,$x2,$h3hi

fmadd $h0lo,$r0lo,$x0,$h0lo
lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
fmadd $h0hi,$r0hi,$x0,$h0hi
lfd $y1,`$LOCALS+8*1`($sp)
fmadd $h2lo,$r2lo,$x0,$h2lo
lfd $y2,`$LOCALS+8*2`($sp)
fmadd $h2hi,$r2hi,$x0,$h2hi
lfd $y3,`$LOCALS+8*3`($sp)
fmadd $h1lo,$r1lo,$x0,$h1lo
fmadd $h1hi,$r1hi,$x0,$h1hi
fmadd $h3lo,$r3lo,$x0,$h3lo
fmadd $h3hi,$r3hi,$x0,$h3hi

bdnz Loop

######################################### base 2^48 -> base 2^32
fadd $c0lo,$h0lo,$two32
fadd $c0hi,$h0hi,$two32
fadd $c2lo,$h2lo,$two96
fadd $c2hi,$h2hi,$two96
fadd $c1lo,$h1lo,$two64
fadd $c1hi,$h1hi,$two64
fadd $c3lo,$h3lo,$two130
fadd $c3hi,$h3hi,$two130

fsub $c0lo,$c0lo,$two32
fsub $c0hi,$c0hi,$two32
fsub $c2lo,$c2lo,$two96
fsub $c2hi,$c2hi,$two96
fsub $c1lo,$c1lo,$two64
fsub $c1hi,$c1hi,$two64
fsub $c3lo,$c3lo,$two130
fsub $c3hi,$c3hi,$two130

fsub $h1lo,$h1lo,$c1lo
fsub $h1hi,$h1hi,$c1hi
fsub $h3lo,$h3lo,$c3lo
fsub $h3hi,$h3hi,$c3hi
fsub $h2lo,$h2lo,$c2lo
fsub $h2hi,$h2hi,$c2hi
fsub $h0lo,$h0lo,$c0lo
fsub $h0hi,$h0hi,$c0hi

fadd $h1lo,$h1lo,$c0lo
fadd $h1hi,$h1hi,$c0hi
fadd $h3lo,$h3lo,$c2lo
fadd $h3hi,$h3hi,$c2hi
fadd $h2lo,$h2lo,$c1lo
fadd $h2hi,$h2hi,$c1hi
fmadd $h0lo,$c3lo,$five_two130,$h0lo
fmadd $h0hi,$c3hi,$five_two130,$h0hi

fadd $x1,$h1lo,$h1hi
fadd $x3,$h3lo,$h3hi
fadd $x2,$h2lo,$h2hi
fadd $x0,$h0lo,$h0hi

lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
fadd $x1,$x1,$two32 # bias
fadd $x3,$x3,$two96
fadd $x2,$x2,$two64
fadd $x0,$x0,$two0

stfd $x1,8*1($ctx) # store [biased] hash value
stfd $x3,8*3($ctx)
stfd $x2,8*2($ctx)
stfd $x0,8*0($ctx)

mtfsf 255,$h0lo # restore original fpscr
lfd f14,`$FRAME-8*18`($sp)
lfd f15,`$FRAME-8*17`($sp)
lfd f16,`$FRAME-8*16`($sp)
lfd f17,`$FRAME-8*15`($sp)
lfd f18,`$FRAME-8*14`($sp)
lfd f19,`$FRAME-8*13`($sp)
lfd f20,`$FRAME-8*12`($sp)
lfd f21,`$FRAME-8*11`($sp)
lfd f22,`$FRAME-8*10`($sp)
lfd f23,`$FRAME-8*9`($sp)
lfd f24,`$FRAME-8*8`($sp)
lfd f25,`$FRAME-8*7`($sp)
lfd f26,`$FRAME-8*6`($sp)
lfd f27,`$FRAME-8*5`($sp)
lfd f28,`$FRAME-8*4`($sp)
lfd f29,`$FRAME-8*3`($sp)
lfd f30,`$FRAME-8*2`($sp)
lfd f31,`$FRAME-8*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,0,4,0
.size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
| 544 | ___ |
{
# poly1305_emit_fpu: $ctx (r3) = context, $mac (r4) = 16-byte output,
# $nonce (r5) = 16-byte nonce.  $mac and $nonce are just new names for the
# $inp and $len argument registers.
#
# Reads the four biased hash doubles from context slots 0..3 as pairs of
# 32-bit words, masks off the exponent bits of each high word, folds the
# bits above bit 1 of the top limb back in (multiplied by 5), propagates
# carries, selects between h and h+5 depending on whether h+5 carried past
# bit 130, adds the nonce and stores the 16-byte result.
my ($mac,$nonce)=($inp,$len);

# $h0..$h4 live in r7..r11; $d0..$d3 use the non-volatile r28..r31, which
# are saved and restored around the body.
my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
) = map("r$_",(7..11,28..31));
my $mask = "r0";
# Local frame size; shadows the file-level $FRAME inside this scope only.
my $FRAME = (6+4)*$SIZE_T;

# Prologue, hash load, exponent masking and partial reduction of the top
# limb; this part is shared by the 32- and 64-bit paths.
$code.=<<___;
.globl .poly1305_emit_fpu
.align 4
.poly1305_emit_fpu:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)

lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)

lis $mask,0xfff0
andc $d0,$d0,$mask # mask exponent
andc $d1,$d1,$mask
andc $d2,$d2,$mask
andc $d3,$d3,$mask # can be partially reduced...
li $mask,3

srwi $padbit,$d3,2 # ... so reduce
and $h4,$d3,$mask
andc $d3,$d3,$mask
add $d3,$d3,$padbit
___
# 32-bit flavours: carry-chained 32-bit adds, one register per limb.
if ($SIZE_T==4) {
$code.=<<___;
addc $h0,$h0,$d3
adde $h1,$h1,$d0
adde $h2,$h2,$d1
adde $h3,$h3,$d2
addze $h4,$h4

addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
addze $d3,$h3
addze $mask,$h4

srwi $mask,$mask,2 # did it carry/borrow?
neg $mask,$mask
srawi $mask,$mask,31 # mask

andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
lwz $d0,0($nonce) # load nonce
andc $h2,$h2,$mask
and $d2,$d2,$mask
or $h1,$h1,$d1
lwz $d1,4($nonce)
andc $h3,$h3,$mask
and $d3,$d3,$mask
or $h2,$h2,$d2
lwz $d2,8($nonce)
or $h3,$h3,$d3
lwz $d3,12($nonce)

addc $h0,$h0,$d0 # accumulate nonce
adde $h1,$h1,$d1
adde $h2,$h2,$d2
adde $h3,$h3,$d3
___
} else {
# 64-bit flavours: add in 64-bit registers, ripple the carries out of
# bit 32 by hand, then pack the four limbs into two 64-bit values.
$code.=<<___;
add $h0,$h0,$d3
add $h1,$h1,$d0
add $h2,$h2,$d1
add $h3,$h3,$d2

srdi $d0,$h0,32
add $h1,$h1,$d0
srdi $d1,$h1,32
add $h2,$h2,$d1
srdi $d2,$h2,32
add $h3,$h3,$d2
srdi $d3,$h3,32
add $h4,$h4,$d3

insrdi $h0,$h1,32,0
insrdi $h2,$h3,32,0

addic $d0,$h0,5 # compare to modulus
addze $d1,$h2
addze $d2,$h4

srdi $mask,$d2,2 # did it carry/borrow?
neg $mask,$mask
sradi $mask,$mask,63 # mask
ld $d2,0($nonce) # load nonce
ld $d3,8($nonce)

andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h2,$h2,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
or $h2,$h2,$d1
___
# Big-endian only: swap the 32-bit halves of each nonce doubleword that
# was fetched with ld.
$code.=<<___ if (!$LITTLE_ENDIAN);
rotldi $d2,$d2,32 # flip nonce words
rotldi $d3,$d3,32
___
$code.=<<___;
addc $h0,$h0,$d2 # accumulate nonce
adde $h2,$h2,$d3

srdi $h1,$h0,32
srdi $h3,$h2,32
___
}
# Store the four result words: plain stw on little-endian flavours,
# byte-reversing stwbrx otherwise.
$code.=<<___ if ($LITTLE_ENDIAN);
stw $h0,0($mac) # write result
stw $h1,4($mac)
stw $h2,8($mac)
stw $h3,12($mac)
___
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
stwbrx $h0,0,$mac # write result
li $d2,8
stwbrx $h1,$d1,$mac
li $d3,12
stwbrx $h2,$d2,$mac
stwbrx $h3,$d3,$mac
___
# Epilogue: restore r28..r31 and pop the frame.
$code.=<<___;
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,4,3,0
.size .poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too much
# from platform to platform...
#
# LPICmeup returns, in $len, the address of the constant table that follows
# it.  It keeps the caller's lr in r0, uses bcl 20,31,$+4 to load the
# address of the next instruction into lr, adds the fixed distance from
# there to the first table entry, and restores lr before returning.
#
# Table entries, 8 bytes each:
#    0..4   2^(52+0), 2^(52+32), 2^(52+64), 2^(52+96), 2^(52+130)
#    5      5/2^130
#    6..9   2^(52+16+0), 2^(52+16+32), 2^(52+16+64), 2^(52+16+96)
#   10..12  2^(52+16+0-96), 2^(52+16+32-96), 2^(52+16+64-96)
#   13      FPSCR image: truncate, no exceptions
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $len # vvvvvv "distance" between . and 1st data entry
addi $len,$len,`64-8` # borrow $len
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`

.quad 0x4330000000000000 # 2^(52+0)
.quad 0x4530000000000000 # 2^(52+32)
.quad 0x4730000000000000 # 2^(52+64)
.quad 0x4930000000000000 # 2^(52+96)
.quad 0x4b50000000000000 # 2^(52+130)

.quad 0x37f4000000000000 # 5/2^130

.quad 0x4430000000000000 # 2^(52+16+0)
.quad 0x4630000000000000 # 2^(52+16+32)
.quad 0x4830000000000000 # 2^(52+16+64)
.quad 0x4a30000000000000 # 2^(52+16+96)
.quad 0x3e30000000000000 # 2^(52+16+0-96)
.quad 0x4030000000000000 # 2^(52+16+32-96)
.quad 0x4230000000000000 # 2^(52+16+64-96)

.quad 0x0000000000000001 # fpscr: truncate, no exceptions
.asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
| 736 | |
# Replace every back-ticked expression in the template with the result of
# evaluating it as Perl, then hand the finished text to the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# Buffered write errors and translator failures only surface at close time.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}