| 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | # |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | # |
| 17 | # October 2015 |
| 18 | # |
| 19 | # ChaCha20 for PowerPC/AltiVec. |
| 20 | # |
| 21 | # June 2018 |
| 22 | # |
| 23 | # Add VSX 2.07 code path. The original 3xAltiVec+1xIALU layout is well |
| 24 | # suited for processors that can't issue more than one vector |
| 25 | # instruction per cycle. But POWER8 (and POWER9) can issue a pair, so |
| 26 | # a vector-only 4x interleave performs better there. PowerISA 2.07 |
| 27 | # (first implemented by POWER8) also defined new usable instructions, |
| 28 | # hence the 4xVSX code path... |
| 29 | # |
| 30 | # Performance in cycles per byte when processing a large buffer. |
| 31 | # |
| 32 | # IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX |
| 33 | # |
| 34 | # Freescale e300 13.6/+115% - - |
| 35 | # PPC74x0/G4e 6.81/+310% 3.81 - |
| 36 | # PPC970/G5 9.29/+160% ? - |
| 37 | # POWER7 8.62/+61% 3.35 - |
| 38 | # POWER8 8.70/+51% 2.91 2.09 |
| 39 | # POWER9 8.80/+29% 4.44(*) 2.45(**) |
| 40 | # |
| 41 | # (*) this is a trade-off result; it could be improved, but only at |
| 42 | # the expense of all the other targets; |
| 43 | # (**) POWER9 seems to be "allergic" to mixing vector and integer |
| 44 | # instructions, which is why the switch to vector-only code pays |
| 45 | # off so much; |
| 46 | |
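#
# The entry points below (ChaCha20_ctr32_int, ChaCha20_ctr32_vmx and
# ChaCha20_ctr32_vsx) all take the same five arguments in r3-r7, in the
# order used for ($out,$inp,$len,$key,$ctr) further down.  Roughly, and
# assuming the usual ctr32 convention on the C side (the exact typedefs
# are the caller's business and are only sketched here), this
# corresponds to:
#
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);
#
# i.e. key is the 256-bit key as eight 32-bit words and counter is the
# 32-bit block counter followed by the 96-bit nonce.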
| 47 | $flavour = shift; |
| 48 | |
| 49 | if ($flavour =~ /64/) { |
| 50 | $SIZE_T =8; |
| 51 | $LRSAVE =2*$SIZE_T; |
| 52 | $STU ="stdu"; |
| 53 | $POP ="ld"; |
| 54 | $PUSH ="std"; |
| 55 | $UCMP ="cmpld"; |
| 56 | } elsif ($flavour =~ /32/) { |
| 57 | $SIZE_T =4; |
| 58 | $LRSAVE =$SIZE_T; |
| 59 | $STU ="stwu"; |
| 60 | $POP ="lwz"; |
| 61 | $PUSH ="stw"; |
| 62 | $UCMP ="cmplw"; |
| 63 | } else { die "nonsense $flavour"; } |
| 64 | |
| 65 | $LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0; |
| 66 | |
| 67 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 68 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
| 69 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
| 70 | die "can't locate ppc-xlate.pl"; |
| 71 | |
| 72 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 73 | |
| 74 | $LOCALS=6*$SIZE_T; |
| 75 | $FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables |
| 76 | |
| 77 | sub AUTOLOAD() # thunk [simplified] x86-style perlasm |
| 78 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
| 79 | $code .= "\t$opcode\t".join(',',@_)."\n"; |
| 80 | } |
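# The AUTOLOAD thunk above lets the round generators below be written as
# lists of strings such as "&add(...)" or "&vadduwm(...)": when such a
# string is eval'ed, any sub that isn't explicitly defined falls through
# to AUTOLOAD, which simply appends "\t<mnemonic>\t<operands>" to $code
# (with '_' in the Perl name turned into '.' for dot-suffixed mnemonics).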
| 81 | |
| 82 | my $sp = "r1"; |
| 83 | |
| 84 | my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); |
| 85 | |
| 86 | my @x=map("r$_",(16..31)); |
| 87 | my @d=map("r$_",(11,12,14,15)); |
| 88 | my @t=map("r$_",(7..10)); |
| 89 | |
| 90 | sub ROUND { |
| 91 | my ($a0,$b0,$c0,$d0)=@_; |
| 92 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
| 93 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
| 94 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
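	# The three maps above derive the remaining quarter-rounds from the
	# first: for each index, ($_&~3) keeps the row base (0, 4, 8 or 12)
	# and (($_+1)&3) steps to the next element of that row, so
	# ROUND(0,4,8,12) covers the columns (0,4,8,12),(1,5,9,13),
	# (2,6,10,14),(3,7,11,15), while ROUND(0,5,10,15) covers the four
	# diagonals.  The four quarter-rounds are emitted interleaved so the
	# core always has independent instructions in flight.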
| 95 | |
| 96 | ( |
| 97 | "&add (@x[$a0],@x[$a0],@x[$b0])", |
| 98 | "&add (@x[$a1],@x[$a1],@x[$b1])", |
| 99 | "&add (@x[$a2],@x[$a2],@x[$b2])", |
| 100 | "&add (@x[$a3],@x[$a3],@x[$b3])", |
| 101 | "&xor (@x[$d0],@x[$d0],@x[$a0])", |
| 102 | "&xor (@x[$d1],@x[$d1],@x[$a1])", |
| 103 | "&xor (@x[$d2],@x[$d2],@x[$a2])", |
| 104 | "&xor (@x[$d3],@x[$d3],@x[$a3])", |
| 105 | "&rotlwi (@x[$d0],@x[$d0],16)", |
| 106 | "&rotlwi (@x[$d1],@x[$d1],16)", |
| 107 | "&rotlwi (@x[$d2],@x[$d2],16)", |
| 108 | "&rotlwi (@x[$d3],@x[$d3],16)", |
| 109 | |
| 110 | "&add (@x[$c0],@x[$c0],@x[$d0])", |
| 111 | "&add (@x[$c1],@x[$c1],@x[$d1])", |
| 112 | "&add (@x[$c2],@x[$c2],@x[$d2])", |
| 113 | "&add (@x[$c3],@x[$c3],@x[$d3])", |
| 114 | "&xor (@x[$b0],@x[$b0],@x[$c0])", |
| 115 | "&xor (@x[$b1],@x[$b1],@x[$c1])", |
| 116 | "&xor (@x[$b2],@x[$b2],@x[$c2])", |
| 117 | "&xor (@x[$b3],@x[$b3],@x[$c3])", |
| 118 | "&rotlwi (@x[$b0],@x[$b0],12)", |
| 119 | "&rotlwi (@x[$b1],@x[$b1],12)", |
| 120 | "&rotlwi (@x[$b2],@x[$b2],12)", |
| 121 | "&rotlwi (@x[$b3],@x[$b3],12)", |
| 122 | |
| 123 | "&add (@x[$a0],@x[$a0],@x[$b0])", |
| 124 | "&add (@x[$a1],@x[$a1],@x[$b1])", |
| 125 | "&add (@x[$a2],@x[$a2],@x[$b2])", |
| 126 | "&add (@x[$a3],@x[$a3],@x[$b3])", |
| 127 | "&xor (@x[$d0],@x[$d0],@x[$a0])", |
| 128 | "&xor (@x[$d1],@x[$d1],@x[$a1])", |
| 129 | "&xor (@x[$d2],@x[$d2],@x[$a2])", |
| 130 | "&xor (@x[$d3],@x[$d3],@x[$a3])", |
| 131 | "&rotlwi (@x[$d0],@x[$d0],8)", |
| 132 | "&rotlwi (@x[$d1],@x[$d1],8)", |
| 133 | "&rotlwi (@x[$d2],@x[$d2],8)", |
| 134 | "&rotlwi (@x[$d3],@x[$d3],8)", |
| 135 | |
| 136 | "&add (@x[$c0],@x[$c0],@x[$d0])", |
| 137 | "&add (@x[$c1],@x[$c1],@x[$d1])", |
| 138 | "&add (@x[$c2],@x[$c2],@x[$d2])", |
| 139 | "&add (@x[$c3],@x[$c3],@x[$d3])", |
| 140 | "&xor (@x[$b0],@x[$b0],@x[$c0])", |
| 141 | "&xor (@x[$b1],@x[$b1],@x[$c1])", |
| 142 | "&xor (@x[$b2],@x[$b2],@x[$c2])", |
| 143 | "&xor (@x[$b3],@x[$b3],@x[$c3])", |
| 144 | "&rotlwi (@x[$b0],@x[$b0],7)", |
| 145 | "&rotlwi (@x[$b1],@x[$b1],7)", |
| 146 | "&rotlwi (@x[$b2],@x[$b2],7)", |
| 147 | "&rotlwi (@x[$b3],@x[$b3],7)" |
| 148 | ); |
| 149 | } |
| 150 | |
| 151 | $code.=<<___; |
| 152 | .machine "any" |
| 153 | .text |
| 154 | |
| 155 | .globl .ChaCha20_ctr32_int |
| 156 | .align 5 |
| 157 | .ChaCha20_ctr32_int: |
| 158 | __ChaCha20_ctr32_int: |
| 159 | ${UCMP}i $len,0 |
| 160 | beqlr- |
| 161 | |
| 162 | $STU $sp,-$FRAME($sp) |
| 163 | mflr r0 |
| 164 | |
| 165 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| 166 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 167 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 168 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| 169 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) |
| 170 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) |
| 171 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) |
| 172 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) |
| 173 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) |
| 174 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) |
| 175 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) |
| 176 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) |
| 177 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) |
| 178 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) |
| 179 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) |
| 180 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 181 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 182 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 183 | $PUSH r0,`$FRAME+$LRSAVE`($sp) |
| 184 | |
| 185 | lwz @d[0],0($ctr) # load counter |
| 186 | lwz @d[1],4($ctr) |
| 187 | lwz @d[2],8($ctr) |
| 188 | lwz @d[3],12($ctr) |
| 189 | |
| 190 | bl __ChaCha20_1x |
| 191 | |
| 192 | $POP r0,`$FRAME+$LRSAVE`($sp) |
| 193 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| 194 | $POP r15,`$FRAME-$SIZE_T*17`($sp) |
| 195 | $POP r16,`$FRAME-$SIZE_T*16`($sp) |
| 196 | $POP r17,`$FRAME-$SIZE_T*15`($sp) |
| 197 | $POP r18,`$FRAME-$SIZE_T*14`($sp) |
| 198 | $POP r19,`$FRAME-$SIZE_T*13`($sp) |
| 199 | $POP r20,`$FRAME-$SIZE_T*12`($sp) |
| 200 | $POP r21,`$FRAME-$SIZE_T*11`($sp) |
| 201 | $POP r22,`$FRAME-$SIZE_T*10`($sp) |
| 202 | $POP r23,`$FRAME-$SIZE_T*9`($sp) |
| 203 | $POP r24,`$FRAME-$SIZE_T*8`($sp) |
| 204 | $POP r25,`$FRAME-$SIZE_T*7`($sp) |
| 205 | $POP r26,`$FRAME-$SIZE_T*6`($sp) |
| 206 | $POP r27,`$FRAME-$SIZE_T*5`($sp) |
| 207 | $POP r28,`$FRAME-$SIZE_T*4`($sp) |
| 208 | $POP r29,`$FRAME-$SIZE_T*3`($sp) |
| 209 | $POP r30,`$FRAME-$SIZE_T*2`($sp) |
| 210 | $POP r31,`$FRAME-$SIZE_T*1`($sp) |
| 211 | mtlr r0 |
| 212 | addi $sp,$sp,$FRAME |
| 213 | blr |
| 214 | .long 0 |
| 215 | .byte 0,12,4,1,0x80,18,5,0 |
| 216 | .long 0 |
| 217 | .size .ChaCha20_ctr32_int,.-.ChaCha20_ctr32_int |
| 218 | |
| 219 | .align 5 |
| 220 | __ChaCha20_1x: |
| 221 | Loop_outer: |
| 222 | lis @x[0],0x6170 # synthesize sigma |
| 223 | lis @x[1],0x3320 |
| 224 | lis @x[2],0x7962 |
| 225 | lis @x[3],0x6b20 |
| 226 | ori @x[0],@x[0],0x7865 |
| 227 | ori @x[1],@x[1],0x646e |
| 228 | ori @x[2],@x[2],0x2d32 |
| 229 | ori @x[3],@x[3],0x6574 |
| 230 | |
| 231 | li r0,10 # inner loop counter |
| 232 | lwz @x[4],0($key) # load key |
| 233 | lwz @x[5],4($key) |
| 234 | lwz @x[6],8($key) |
| 235 | lwz @x[7],12($key) |
| 236 | lwz @x[8],16($key) |
| 237 | mr @x[12],@d[0] # copy counter |
| 238 | lwz @x[9],20($key) |
| 239 | mr @x[13],@d[1] |
| 240 | lwz @x[10],24($key) |
| 241 | mr @x[14],@d[2] |
| 242 | lwz @x[11],28($key) |
| 243 | mr @x[15],@d[3] |
| 244 | |
| 245 | mr @t[0],@x[4] |
| 246 | mr @t[1],@x[5] |
| 247 | mr @t[2],@x[6] |
| 248 | mr @t[3],@x[7] |
| 249 | |
| 250 | mtctr r0 |
| 251 | Loop: |
| 252 | ___ |
| 253 | foreach (&ROUND(0, 4, 8,12)) { eval; } |
| 254 | foreach (&ROUND(0, 5,10,15)) { eval; } |
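# One pass through Loop is one ChaCha double round: a column round
# (ROUND(0,4,8,12)) followed by a diagonal round (ROUND(0,5,10,15)).
# With the loop counter set to 10 above this gives the 20 rounds of
# ChaCha20.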
| 255 | $code.=<<___; |
| 256 | bdnz Loop |
| 257 | |
| 258 | subic $len,$len,64 # $len-=64 |
| 259 | addi @x[0],@x[0],0x7865 # accumulate key block |
| 260 | addi @x[1],@x[1],0x646e |
| 261 | addi @x[2],@x[2],0x2d32 |
| 262 | addi @x[3],@x[3],0x6574 |
| 263 | addis @x[0],@x[0],0x6170 |
| 264 | addis @x[1],@x[1],0x3320 |
| 265 | addis @x[2],@x[2],0x7962 |
| 266 | addis @x[3],@x[3],0x6b20 |
| 267 | |
| 268 | subfe. r0,r0,r0 # borrow?-1:0 |
| 269 | add @x[4],@x[4],@t[0] |
| 270 | lwz @t[0],16($key) |
| 271 | add @x[5],@x[5],@t[1] |
| 272 | lwz @t[1],20($key) |
| 273 | add @x[6],@x[6],@t[2] |
| 274 | lwz @t[2],24($key) |
| 275 | add @x[7],@x[7],@t[3] |
| 276 | lwz @t[3],28($key) |
| 277 | add @x[8],@x[8],@t[0] |
| 278 | add @x[9],@x[9],@t[1] |
| 279 | add @x[10],@x[10],@t[2] |
| 280 | add @x[11],@x[11],@t[3] |
| 281 | |
| 282 | add @x[12],@x[12],@d[0] |
| 283 | add @x[13],@x[13],@d[1] |
| 284 | add @x[14],@x[14],@d[2] |
| 285 | add @x[15],@x[15],@d[3] |
| 286 | addi @d[0],@d[0],1 # increment counter |
| 287 | ___ |
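# Notes on the block epilogue above: "subic $len,$len,64" sets CA unless
# the subtraction borrowed, and "subfe. r0,r0,r0" then yields 0 (no
# borrow) or -1 (borrow) while setting CR0, so the later "bne Ltail"
# takes the partial-block path exactly when fewer than 64 bytes were
# left.  The sigma constants are added back in two halves (addi for the
# low 16 bits, addis for the high 16) so they never have to be kept in
# registers across the round loop.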
| 288 | if (!$LITTLE_ENDIAN) { for($i=0;$i<16;$i++) { # flip byte order |
| 289 | $code.=<<___; |
| 290 | mr @t[$i&3],@x[$i] |
| 291 | rotlwi @x[$i],@x[$i],8 |
| 292 | rlwimi @x[$i],@t[$i&3],24,0,7 |
| 293 | rlwimi @x[$i],@t[$i&3],24,16,23 |
| 294 | ___ |
| 295 | } } |
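# For big-endian builds the loop above emits, for every state word, the
# classic three-instruction 32-bit byte swap: rotlwi by 8 moves each
# byte one position (with wrap-around), and the two rlwimi instructions
# patch bytes 0 and 2 back in from the saved copy rotated by 24, so the
# little-endian key stream can then be XORed against memory with plain
# word loads and stores.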
| 296 | $code.=<<___; |
| 297 | bne Ltail # $len-=64 borrowed |
| 298 | |
| 299 | lwz @t[0],0($inp) # load input, aligned or not |
| 300 | lwz @t[1],4($inp) |
| 301 | ${UCMP}i $len,0 # done already? |
| 302 | lwz @t[2],8($inp) |
| 303 | lwz @t[3],12($inp) |
| 304 | xor @x[0],@x[0],@t[0] # xor with input |
| 305 | lwz @t[0],16($inp) |
| 306 | xor @x[1],@x[1],@t[1] |
| 307 | lwz @t[1],20($inp) |
| 308 | xor @x[2],@x[2],@t[2] |
| 309 | lwz @t[2],24($inp) |
| 310 | xor @x[3],@x[3],@t[3] |
| 311 | lwz @t[3],28($inp) |
| 312 | xor @x[4],@x[4],@t[0] |
| 313 | lwz @t[0],32($inp) |
| 314 | xor @x[5],@x[5],@t[1] |
| 315 | lwz @t[1],36($inp) |
| 316 | xor @x[6],@x[6],@t[2] |
| 317 | lwz @t[2],40($inp) |
| 318 | xor @x[7],@x[7],@t[3] |
| 319 | lwz @t[3],44($inp) |
| 320 | xor @x[8],@x[8],@t[0] |
| 321 | lwz @t[0],48($inp) |
| 322 | xor @x[9],@x[9],@t[1] |
| 323 | lwz @t[1],52($inp) |
| 324 | xor @x[10],@x[10],@t[2] |
| 325 | lwz @t[2],56($inp) |
| 326 | xor @x[11],@x[11],@t[3] |
| 327 | lwz @t[3],60($inp) |
| 328 | xor @x[12],@x[12],@t[0] |
| 329 | stw @x[0],0($out) # store output, aligned or not |
| 330 | xor @x[13],@x[13],@t[1] |
| 331 | stw @x[1],4($out) |
| 332 | xor @x[14],@x[14],@t[2] |
| 333 | stw @x[2],8($out) |
| 334 | xor @x[15],@x[15],@t[3] |
| 335 | stw @x[3],12($out) |
| 336 | stw @x[4],16($out) |
| 337 | stw @x[5],20($out) |
| 338 | stw @x[6],24($out) |
| 339 | stw @x[7],28($out) |
| 340 | stw @x[8],32($out) |
| 341 | stw @x[9],36($out) |
| 342 | stw @x[10],40($out) |
| 343 | stw @x[11],44($out) |
| 344 | stw @x[12],48($out) |
| 345 | stw @x[13],52($out) |
| 346 | stw @x[14],56($out) |
| 347 | addi $inp,$inp,64 |
| 348 | stw @x[15],60($out) |
| 349 | addi $out,$out,64 |
| 350 | |
| 351 | bne Loop_outer |
| 352 | |
| 353 | blr |
| 354 | |
| 355 | .align 4 |
| 356 | Ltail: |
| 357 | addi $len,$len,64 # restore tail length |
| 358 | subi $inp,$inp,1 # prepare for *++ptr |
| 359 | subi $out,$out,1 |
| 360 | addi @t[0],$sp,$LOCALS-1 |
| 361 | mtctr $len |
| 362 | |
| 363 | stw @x[0],`$LOCALS+0`($sp) # save whole block to stack |
| 364 | stw @x[1],`$LOCALS+4`($sp) |
| 365 | stw @x[2],`$LOCALS+8`($sp) |
| 366 | stw @x[3],`$LOCALS+12`($sp) |
| 367 | stw @x[4],`$LOCALS+16`($sp) |
| 368 | stw @x[5],`$LOCALS+20`($sp) |
| 369 | stw @x[6],`$LOCALS+24`($sp) |
| 370 | stw @x[7],`$LOCALS+28`($sp) |
| 371 | stw @x[8],`$LOCALS+32`($sp) |
| 372 | stw @x[9],`$LOCALS+36`($sp) |
| 373 | stw @x[10],`$LOCALS+40`($sp) |
| 374 | stw @x[11],`$LOCALS+44`($sp) |
| 375 | stw @x[12],`$LOCALS+48`($sp) |
| 376 | stw @x[13],`$LOCALS+52`($sp) |
| 377 | stw @x[14],`$LOCALS+56`($sp) |
| 378 | stw @x[15],`$LOCALS+60`($sp) |
| 379 | |
| 380 | Loop_tail: # byte-by-byte loop |
| 381 | lbzu @d[0],1($inp) |
| 382 | lbzu @x[0],1(@t[0]) |
| 383 | xor @d[1],@d[0],@x[0] |
| 384 | stbu @d[1],1($out) |
| 385 | bdnz Loop_tail |
| 386 | |
| 387 | stw $sp,`$LOCALS+0`($sp) # wipe block on stack |
| 388 | stw $sp,`$LOCALS+4`($sp) |
| 389 | stw $sp,`$LOCALS+8`($sp) |
| 390 | stw $sp,`$LOCALS+12`($sp) |
| 391 | stw $sp,`$LOCALS+16`($sp) |
| 392 | stw $sp,`$LOCALS+20`($sp) |
| 393 | stw $sp,`$LOCALS+24`($sp) |
| 394 | stw $sp,`$LOCALS+28`($sp) |
| 395 | stw $sp,`$LOCALS+32`($sp) |
| 396 | stw $sp,`$LOCALS+36`($sp) |
| 397 | stw $sp,`$LOCALS+40`($sp) |
| 398 | stw $sp,`$LOCALS+44`($sp) |
| 399 | stw $sp,`$LOCALS+48`($sp) |
| 400 | stw $sp,`$LOCALS+52`($sp) |
| 401 | stw $sp,`$LOCALS+56`($sp) |
| 402 | stw $sp,`$LOCALS+60`($sp) |
| 403 | |
| 404 | blr |
| 405 | .long 0 |
| 406 | .byte 0,12,0x14,0,0,0,0,0 |
| 407 | ___ |
| 408 | |
| 409 | {{{ |
| 410 | my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2) |
| 411 | = map("v$_",(0..11)); |
| 412 | my @K = map("v$_",(12..17)); |
| 413 | my ($FOUR,$sixteen,$twenty4) = map("v$_",(18..19,23)); |
| 414 | my ($inpperm,$outperm,$outmask) = map("v$_",(24..26)); |
| 415 | my @D = map("v$_",(27..31)); |
| 416 | my ($twelve,$seven,$T0,$T1) = @D; |
| 417 | |
| 418 | my $FRAME=$LOCALS+64+10*16+18*$SIZE_T; # 10*16 is for v23-v31 offload |
| 419 | |
| 420 | sub VMXROUND { |
| 421 | my $odd = pop; |
| 422 | my ($a,$b,$c,$d)=@_; |
| 423 | |
| 424 | ( |
| 425 | "&vadduwm ('$a','$a','$b')", |
| 426 | "&vxor ('$d','$d','$a')", |
| 427 | "&vperm ('$d','$d','$d','$sixteen')", |
| 428 | |
| 429 | "&vadduwm ('$c','$c','$d')", |
| 430 | "&vxor ('$b','$b','$c')", |
| 431 | "&vrlw ('$b','$b','$twelve')", |
| 432 | |
| 433 | "&vadduwm ('$a','$a','$b')", |
| 434 | "&vxor ('$d','$d','$a')", |
| 435 | "&vperm ('$d','$d','$d','$twenty4')", |
| 436 | |
| 437 | "&vadduwm ('$c','$c','$d')", |
| 438 | "&vxor ('$b','$b','$c')", |
| 439 | "&vrlw ('$b','$b','$seven')", |
| 440 | |
| 441 | "&vrldoi ('$c','$c',8)", |
| 442 | "&vrldoi ('$b','$b',$odd?4:12)", |
| 443 | "&vrldoi ('$d','$d',$odd?12:4)" |
| 444 | ); |
| 445 | } |
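# VMXROUND emits one ChaCha quarter-round with each of a/b/c/d held in a
# full 128-bit register, i.e. one 64-byte block per set of four
# registers.  The whole-byte rotations of the d row are done with vperm
# through the byte-shuffle masks loaded into $sixteen and $twenty4,
# while the 12- and 7-bit rotations use vrlw with the splatted $twelve
# and $seven constants.  The trailing vrldoi lane rotations shuffle the
# b, c and d rows between the column and the diagonal arrangement (and
# back, depending on $odd), so consecutive even/odd calls implement a
# full double round without a separate transpose step.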
| 446 | |
| 447 | $code.=<<___; |
| 448 | |
| 449 | .globl .ChaCha20_ctr32_vmx |
| 450 | .align 5 |
| 451 | .ChaCha20_ctr32_vmx: |
| 452 | ${UCMP}i $len,256 |
| 453 | blt __ChaCha20_ctr32_int |
| 454 | |
| 455 | $STU $sp,-$FRAME($sp) |
| 456 | mflr r0 |
| 457 | li r10,`15+$LOCALS+64` |
| 458 | li r11,`31+$LOCALS+64` |
| 459 | mfspr r12,256 |
| 460 | stvx v23,r10,$sp |
| 461 | addi r10,r10,32 |
| 462 | stvx v24,r11,$sp |
| 463 | addi r11,r11,32 |
| 464 | stvx v25,r10,$sp |
| 465 | addi r10,r10,32 |
| 466 | stvx v26,r11,$sp |
| 467 | addi r11,r11,32 |
| 468 | stvx v27,r10,$sp |
| 469 | addi r10,r10,32 |
| 470 | stvx v28,r11,$sp |
| 471 | addi r11,r11,32 |
| 472 | stvx v29,r10,$sp |
| 473 | addi r10,r10,32 |
| 474 | stvx v30,r11,$sp |
| 475 | stvx v31,r10,$sp |
| 476 | stw r12,`$FRAME-$SIZE_T*18-4`($sp) # save vrsave |
| 477 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| 478 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 479 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 480 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| 481 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) |
| 482 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) |
| 483 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) |
| 484 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) |
| 485 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) |
| 486 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) |
| 487 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) |
| 488 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) |
| 489 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) |
| 490 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) |
| 491 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) |
| 492 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 493 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 494 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 495 | li r12,-4096+511 |
| 496 | $PUSH r0, `$FRAME+$LRSAVE`($sp) |
| 497 | mtspr 256,r12 # preserve 29 AltiVec registers |
| 498 | |
| 499 | bl Lconsts # returns pointer to Lsigma in r12 |
| 500 | li @x[0],16 |
| 501 | li @x[1],32 |
| 502 | li @x[2],48 |
| 503 | li @x[3],64 |
| 504 | li @x[4],31 # 31 is not a typo |
| 505 | li @x[5],15 # nor is 15 |
| 506 | |
| 507 | lvx @K[1],0,$key # load key |
| 508 | ?lvsr $T0,0,$key # prepare unaligned load |
| 509 | lvx @K[2],@x[0],$key |
| 510 | lvx @D[0],@x[4],$key |
| 511 | |
| 512 | lvx @K[3],0,$ctr # load counter |
| 513 | ?lvsr $T1,0,$ctr # prepare unaligned load |
| 514 | lvx @D[1],@x[5],$ctr |
| 515 | |
| 516 | lvx @K[0],0,r12 # load constants |
| 517 | lvx @K[5],@x[0],r12 # one |
| 518 | lvx $FOUR,@x[1],r12 |
| 519 | lvx $sixteen,@x[2],r12 |
| 520 | lvx $twenty4,@x[3],r12 |
| 521 | |
| 522 | ?vperm @K[1],@K[2],@K[1],$T0 # align key |
| 523 | ?vperm @K[2],@D[0],@K[2],$T0 |
| 524 | ?vperm @K[3],@D[1],@K[3],$T1 # align counter |
| 525 | |
| 526 | lwz @d[0],0($ctr) # load counter to GPR |
| 527 | lwz @d[1],4($ctr) |
| 528 | vadduwm @K[3],@K[3],@K[5] # adjust AltiVec counter |
| 529 | lwz @d[2],8($ctr) |
| 530 | vadduwm @K[4],@K[3],@K[5] |
| 531 | lwz @d[3],12($ctr) |
| 532 | vadduwm @K[5],@K[4],@K[5] |
| 533 | |
| 534 | vxor $T0,$T0,$T0 # 0x00..00 |
| 535 | vspltisw $outmask,-1 # 0xff..ff |
| 536 | ?lvsr $inpperm,0,$inp # prepare for unaligned load |
| 537 | ?lvsl $outperm,0,$out # prepare for unaligned store |
| 538 | ?vperm $outmask,$outmask,$T0,$outperm |
| 539 | |
| 540 | be?lvsl $T0,0,@x[0] # 0x00..0f |
| 541 | be?vspltisb $T1,3 # 0x03..03 |
| 542 | be?vxor $T0,$T0,$T1 # swap bytes within words |
| 543 | be?vxor $outperm,$outperm,$T1 |
| 544 | be?vperm $inpperm,$inpperm,$inpperm,$T0 |
| 545 | |
| 546 | li r0,10 # inner loop counter |
| 547 | b Loop_outer_vmx |
| 548 | |
| 549 | .align 4 |
| 550 | Loop_outer_vmx: |
| 551 | lis @x[0],0x6170 # synthesize sigma |
| 552 | lis @x[1],0x3320 |
| 553 | vmr $A0,@K[0] |
| 554 | lis @x[2],0x7962 |
| 555 | lis @x[3],0x6b20 |
| 556 | vmr $A1,@K[0] |
| 557 | ori @x[0],@x[0],0x7865 |
| 558 | ori @x[1],@x[1],0x646e |
| 559 | vmr $A2,@K[0] |
| 560 | ori @x[2],@x[2],0x2d32 |
| 561 | ori @x[3],@x[3],0x6574 |
| 562 | vmr $B0,@K[1] |
| 563 | |
| 564 | lwz @x[4],0($key) # load key to GPR |
| 565 | vmr $B1,@K[1] |
| 566 | lwz @x[5],4($key) |
| 567 | vmr $B2,@K[1] |
| 568 | lwz @x[6],8($key) |
| 569 | vmr $C0,@K[2] |
| 570 | lwz @x[7],12($key) |
| 571 | vmr $C1,@K[2] |
| 572 | lwz @x[8],16($key) |
| 573 | vmr $C2,@K[2] |
| 574 | mr @x[12],@d[0] # copy GPR counter |
| 575 | lwz @x[9],20($key) |
| 576 | vmr $D0,@K[3] |
| 577 | mr @x[13],@d[1] |
| 578 | lwz @x[10],24($key) |
| 579 | vmr $D1,@K[4] |
| 580 | mr @x[14],@d[2] |
| 581 | lwz @x[11],28($key) |
| 582 | vmr $D2,@K[5] |
| 583 | mr @x[15],@d[3] |
| 584 | |
| 585 | mr @t[0],@x[4] |
| 586 | mr @t[1],@x[5] |
| 587 | mr @t[2],@x[6] |
| 588 | mr @t[3],@x[7] |
| 589 | |
| 590 | vspltisw $twelve,12 # synthesize constants |
| 591 | vspltisw $seven,7 |
| 592 | |
| 593 | mtctr r0 |
| 594 | nop |
| 595 | Loop_vmx: |
| 596 | ___ |
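# Here the three vector "threads" (one 64-byte block each in $A0..$D0,
# $A1..$D1 and $A2..$D2) are interleaved with the scalar ROUND working
# on a fourth block in GPRs, three scalar instructions woven between
# every three vector instructions; this is the 3xAltiVec+1xIALU scheme
# mentioned at the top of the file.  Each pass through Loop_outer_vmx
# therefore advances four blocks, i.e. 256 bytes.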
| 597 | my @thread0=&VMXROUND($A0,$B0,$C0,$D0,0); |
| 598 | my @thread1=&VMXROUND($A1,$B1,$C1,$D1,0); |
| 599 | my @thread2=&VMXROUND($A2,$B2,$C2,$D2,0); |
| 600 | my @thread3=&ROUND(0,4,8,12); |
| 601 | |
| 602 | foreach (@thread0) { |
| 603 | eval; |
| 604 | eval(shift(@thread1)); |
| 605 | eval(shift(@thread2)); |
| 606 | |
| 607 | eval(shift(@thread3)); |
| 608 | eval(shift(@thread3)); |
| 609 | eval(shift(@thread3)); |
| 610 | } |
| 611 | foreach (@thread3) { eval; } |
| 612 | |
| 613 | @thread0=&VMXROUND($A0,$B0,$C0,$D0,1); |
| 614 | @thread1=&VMXROUND($A1,$B1,$C1,$D1,1); |
| 615 | @thread2=&VMXROUND($A2,$B2,$C2,$D2,1); |
| 616 | @thread3=&ROUND(0,5,10,15); |
| 617 | |
| 618 | foreach (@thread0) { |
| 619 | eval; |
| 620 | eval(shift(@thread1)); |
| 621 | eval(shift(@thread2)); |
| 622 | |
| 623 | eval(shift(@thread3)); |
| 624 | eval(shift(@thread3)); |
| 625 | eval(shift(@thread3)); |
| 626 | } |
| 627 | foreach (@thread3) { eval; } |
| 628 | $code.=<<___; |
| 629 | bdnz Loop_vmx |
| 630 | |
| 631 | subi $len,$len,256 # $len-=256 |
| 632 | addi @x[0],@x[0],0x7865 # accumulate key block |
| 633 | addi @x[1],@x[1],0x646e |
| 634 | addi @x[2],@x[2],0x2d32 |
| 635 | addi @x[3],@x[3],0x6574 |
| 636 | addis @x[0],@x[0],0x6170 |
| 637 | addis @x[1],@x[1],0x3320 |
| 638 | addis @x[2],@x[2],0x7962 |
| 639 | addis @x[3],@x[3],0x6b20 |
| 640 | add @x[4],@x[4],@t[0] |
| 641 | lwz @t[0],16($key) |
| 642 | add @x[5],@x[5],@t[1] |
| 643 | lwz @t[1],20($key) |
| 644 | add @x[6],@x[6],@t[2] |
| 645 | lwz @t[2],24($key) |
| 646 | add @x[7],@x[7],@t[3] |
| 647 | lwz @t[3],28($key) |
| 648 | add @x[8],@x[8],@t[0] |
| 649 | add @x[9],@x[9],@t[1] |
| 650 | add @x[10],@x[10],@t[2] |
| 651 | add @x[11],@x[11],@t[3] |
| 652 | add @x[12],@x[12],@d[0] |
| 653 | add @x[13],@x[13],@d[1] |
| 654 | add @x[14],@x[14],@d[2] |
| 655 | add @x[15],@x[15],@d[3] |
| 656 | |
| 657 | vadduwm $A0,$A0,@K[0] # accumulate key block |
| 658 | vadduwm $A1,$A1,@K[0] |
| 659 | vadduwm $A2,$A2,@K[0] |
| 660 | vadduwm $B0,$B0,@K[1] |
| 661 | vadduwm $B1,$B1,@K[1] |
| 662 | vadduwm $B2,$B2,@K[1] |
| 663 | vadduwm $C0,$C0,@K[2] |
| 664 | vadduwm $C1,$C1,@K[2] |
| 665 | vadduwm $C2,$C2,@K[2] |
| 666 | vadduwm $D0,$D0,@K[3] |
| 667 | vadduwm $D1,$D1,@K[4] |
| 668 | vadduwm $D2,$D2,@K[5] |
| 669 | |
| 670 | addi @d[0],@d[0],4 # increment counter |
| 671 | vadduwm @K[3],@K[3],$FOUR |
| 672 | vadduwm @K[4],@K[4],$FOUR |
| 673 | vadduwm @K[5],@K[5],$FOUR |
| 674 | |
| 675 | ___ |
| 676 | if (!$LITTLE_ENDIAN) { for($i=0;$i<16;$i++) { # flip byte order |
| 677 | $code.=<<___; |
| 678 | mr @t[$i&3],@x[$i] |
| 679 | rotlwi @x[$i],@x[$i],8 |
| 680 | rlwimi @x[$i],@t[$i&3],24,0,7 |
| 681 | rlwimi @x[$i],@t[$i&3],24,16,23 |
| 682 | ___ |
| 683 | } } |
| 684 | $code.=<<___; |
| 685 | lwz @t[0],0($inp) # load input, aligned or not |
| 686 | lwz @t[1],4($inp) |
| 687 | lwz @t[2],8($inp) |
| 688 | lwz @t[3],12($inp) |
| 689 | xor @x[0],@x[0],@t[0] # xor with input |
| 690 | lwz @t[0],16($inp) |
| 691 | xor @x[1],@x[1],@t[1] |
| 692 | lwz @t[1],20($inp) |
| 693 | xor @x[2],@x[2],@t[2] |
| 694 | lwz @t[2],24($inp) |
| 695 | xor @x[3],@x[3],@t[3] |
| 696 | lwz @t[3],28($inp) |
| 697 | xor @x[4],@x[4],@t[0] |
| 698 | lwz @t[0],32($inp) |
| 699 | xor @x[5],@x[5],@t[1] |
| 700 | lwz @t[1],36($inp) |
| 701 | xor @x[6],@x[6],@t[2] |
| 702 | lwz @t[2],40($inp) |
| 703 | xor @x[7],@x[7],@t[3] |
| 704 | lwz @t[3],44($inp) |
| 705 | xor @x[8],@x[8],@t[0] |
| 706 | lwz @t[0],48($inp) |
| 707 | xor @x[9],@x[9],@t[1] |
| 708 | lwz @t[1],52($inp) |
| 709 | xor @x[10],@x[10],@t[2] |
| 710 | lwz @t[2],56($inp) |
| 711 | xor @x[11],@x[11],@t[3] |
| 712 | lwz @t[3],60($inp) |
| 713 | xor @x[12],@x[12],@t[0] |
| 714 | stw @x[0],0($out) # store output, aligned or not |
| 715 | xor @x[13],@x[13],@t[1] |
| 716 | stw @x[1],4($out) |
| 717 | xor @x[14],@x[14],@t[2] |
| 718 | stw @x[2],8($out) |
| 719 | xor @x[15],@x[15],@t[3] |
| 720 | stw @x[3],12($out) |
| 721 | addi $inp,$inp,64 |
| 722 | stw @x[4],16($out) |
| 723 | li @t[0],16 |
| 724 | stw @x[5],20($out) |
| 725 | li @t[1],32 |
| 726 | stw @x[6],24($out) |
| 727 | li @t[2],48 |
| 728 | stw @x[7],28($out) |
| 729 | li @t[3],64 |
| 730 | stw @x[8],32($out) |
| 731 | stw @x[9],36($out) |
| 732 | stw @x[10],40($out) |
| 733 | stw @x[11],44($out) |
| 734 | stw @x[12],48($out) |
| 735 | stw @x[13],52($out) |
| 736 | stw @x[14],56($out) |
| 737 | stw @x[15],60($out) |
| 738 | addi $out,$out,64 |
| 739 | |
| 740 | lvx @D[0],0,$inp # load input |
| 741 | lvx @D[1],@t[0],$inp |
| 742 | lvx @D[2],@t[1],$inp |
| 743 | lvx @D[3],@t[2],$inp |
| 744 | lvx @D[4],@t[3],$inp |
| 745 | addi $inp,$inp,64 |
| 746 | |
| 747 | ?vperm @D[0],@D[1],@D[0],$inpperm # align input |
| 748 | ?vperm @D[1],@D[2],@D[1],$inpperm |
| 749 | ?vperm @D[2],@D[3],@D[2],$inpperm |
| 750 | ?vperm @D[3],@D[4],@D[3],$inpperm |
| 751 | vxor $A0,$A0,@D[0] # xor with input |
| 752 | vxor $B0,$B0,@D[1] |
| 753 | lvx @D[1],@t[0],$inp # keep loading input |
| 754 | vxor $C0,$C0,@D[2] |
| 755 | lvx @D[2],@t[1],$inp |
| 756 | vxor $D0,$D0,@D[3] |
| 757 | lvx @D[3],@t[2],$inp |
| 758 | lvx @D[0],@t[3],$inp |
| 759 | addi $inp,$inp,64 |
| 760 | li @t[3],63 # 63 is not a typo |
| 761 | vperm $A0,$A0,$A0,$outperm # pre-misalign output |
| 762 | vperm $B0,$B0,$B0,$outperm |
| 763 | vperm $C0,$C0,$C0,$outperm |
| 764 | vperm $D0,$D0,$D0,$outperm |
| 765 | |
| 766 | ?vperm @D[4],@D[1],@D[4],$inpperm # align input |
| 767 | ?vperm @D[1],@D[2],@D[1],$inpperm |
| 768 | ?vperm @D[2],@D[3],@D[2],$inpperm |
| 769 | ?vperm @D[3],@D[0],@D[3],$inpperm |
| 770 | vxor $A1,$A1,@D[4] |
| 771 | vxor $B1,$B1,@D[1] |
| 772 | lvx @D[1],@t[0],$inp # keep loading input |
| 773 | vxor $C1,$C1,@D[2] |
| 774 | lvx @D[2],@t[1],$inp |
| 775 | vxor $D1,$D1,@D[3] |
| 776 | lvx @D[3],@t[2],$inp |
| 777 | lvx @D[4],@t[3],$inp # redundant in aligned case |
| 778 | addi $inp,$inp,64 |
| 779 | vperm $A1,$A1,$A1,$outperm # pre-misalign output |
| 780 | vperm $B1,$B1,$B1,$outperm |
| 781 | vperm $C1,$C1,$C1,$outperm |
| 782 | vperm $D1,$D1,$D1,$outperm |
| 783 | |
| 784 | ?vperm @D[0],@D[1],@D[0],$inpperm # align input |
| 785 | ?vperm @D[1],@D[2],@D[1],$inpperm |
| 786 | ?vperm @D[2],@D[3],@D[2],$inpperm |
| 787 | ?vperm @D[3],@D[4],@D[3],$inpperm |
| 788 | vxor $A2,$A2,@D[0] |
| 789 | vxor $B2,$B2,@D[1] |
| 790 | vxor $C2,$C2,@D[2] |
| 791 | vxor $D2,$D2,@D[3] |
| 792 | vperm $A2,$A2,$A2,$outperm # pre-misalign output |
| 793 | vperm $B2,$B2,$B2,$outperm |
| 794 | vperm $C2,$C2,$C2,$outperm |
| 795 | vperm $D2,$D2,$D2,$outperm |
| 796 | |
| 797 | andi. @x[1],$out,15 # is $out aligned? |
| 798 | mr @x[0],$out |
| 799 | |
| 800 | vsel @D[0],$A0,$B0,$outmask # collect pre-misaligned output |
| 801 | vsel @D[1],$B0,$C0,$outmask |
| 802 | vsel @D[2],$C0,$D0,$outmask |
| 803 | vsel @D[3],$D0,$A1,$outmask |
| 804 | vsel $B0,$A1,$B1,$outmask |
| 805 | vsel $C0,$B1,$C1,$outmask |
| 806 | vsel $D0,$C1,$D1,$outmask |
| 807 | vsel $A1,$D1,$A2,$outmask |
| 808 | vsel $B1,$A2,$B2,$outmask |
| 809 | vsel $C1,$B2,$C2,$outmask |
| 810 | vsel $D1,$C2,$D2,$outmask |
| 811 | |
| 812 | #stvx $A0,0,$out # take it easy on the edges |
| 813 | stvx @D[0],@t[0],$out # store output |
| 814 | stvx @D[1],@t[1],$out |
| 815 | stvx @D[2],@t[2],$out |
| 816 | addi $out,$out,64 |
| 817 | stvx @D[3],0,$out |
| 818 | stvx $B0,@t[0],$out |
| 819 | stvx $C0,@t[1],$out |
| 820 | stvx $D0,@t[2],$out |
| 821 | addi $out,$out,64 |
| 822 | stvx $A1,0,$out |
| 823 | stvx $B1,@t[0],$out |
| 824 | stvx $C1,@t[1],$out |
| 825 | stvx $D1,@t[2],$out |
| 826 | addi $out,$out,64 |
| 827 | |
| 828 | beq Laligned_vmx |
| 829 | |
| 830 | sub @x[2],$out,@x[1] # in misaligned case edges |
| 831 | li @x[3],0 # are written byte-by-byte |
| 832 | Lunaligned_tail_vmx: |
| 833 | stvebx $D2,@x[3],@x[2] |
| 834 | addi @x[3],@x[3],1 |
| 835 | cmpw @x[3],@x[1] |
| 836 | bne Lunaligned_tail_vmx |
| 837 | |
| 838 | sub @x[2],@x[0],@x[1] |
| 839 | Lunaligned_head_vmx: |
| 840 | stvebx $A0,@x[1],@x[2] |
| 841 | cmpwi @x[1],15 |
| 842 | addi @x[1],@x[1],1 |
| 843 | bne Lunaligned_head_vmx |
| 844 | |
| 845 | ${UCMP}i $len,255 # done with 256-byte blocks yet? |
| 846 | bgt Loop_outer_vmx |
| 847 | |
| 848 | b Ldone_vmx |
| 849 | |
| 850 | .align 4 |
| 851 | Laligned_vmx: |
| 852 | stvx $A0,0,@x[0] # head hexaword was not stored |
| 853 | |
| 854 | ${UCMP}i $len,255 # done with 256-byte blocks yet? |
| 855 | bgt Loop_outer_vmx |
| 856 | nop |
| 857 | |
| 858 | Ldone_vmx: |
| 859 | ${UCMP}i $len,0 # done yet? |
| 860 | bnel __ChaCha20_1x |
| 861 | |
| 862 | lwz r12,`$FRAME-$SIZE_T*18-4`($sp) # pull vrsave |
| 863 | li r10,`15+$LOCALS+64` |
| 864 | li r11,`31+$LOCALS+64` |
| 865 | mtspr 256,r12 # restore vrsave |
| 866 | lvx v23,r10,$sp |
| 867 | addi r10,r10,32 |
| 868 | lvx v24,r11,$sp |
| 869 | addi r11,r11,32 |
| 870 | lvx v25,r10,$sp |
| 871 | addi r10,r10,32 |
| 872 | lvx v26,r11,$sp |
| 873 | addi r11,r11,32 |
| 874 | lvx v27,r10,$sp |
| 875 | addi r10,r10,32 |
| 876 | lvx v28,r11,$sp |
| 877 | addi r11,r11,32 |
| 878 | lvx v29,r10,$sp |
| 879 | addi r10,r10,32 |
| 880 | lvx v30,r11,$sp |
| 881 | lvx v31,r10,$sp |
| 882 | $POP r0, `$FRAME+$LRSAVE`($sp) |
| 883 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| 884 | $POP r15,`$FRAME-$SIZE_T*17`($sp) |
| 885 | $POP r16,`$FRAME-$SIZE_T*16`($sp) |
| 886 | $POP r17,`$FRAME-$SIZE_T*15`($sp) |
| 887 | $POP r18,`$FRAME-$SIZE_T*14`($sp) |
| 888 | $POP r19,`$FRAME-$SIZE_T*13`($sp) |
| 889 | $POP r20,`$FRAME-$SIZE_T*12`($sp) |
| 890 | $POP r21,`$FRAME-$SIZE_T*11`($sp) |
| 891 | $POP r22,`$FRAME-$SIZE_T*10`($sp) |
| 892 | $POP r23,`$FRAME-$SIZE_T*9`($sp) |
| 893 | $POP r24,`$FRAME-$SIZE_T*8`($sp) |
| 894 | $POP r25,`$FRAME-$SIZE_T*7`($sp) |
| 895 | $POP r26,`$FRAME-$SIZE_T*6`($sp) |
| 896 | $POP r27,`$FRAME-$SIZE_T*5`($sp) |
| 897 | $POP r28,`$FRAME-$SIZE_T*4`($sp) |
| 898 | $POP r29,`$FRAME-$SIZE_T*3`($sp) |
| 899 | $POP r30,`$FRAME-$SIZE_T*2`($sp) |
| 900 | $POP r31,`$FRAME-$SIZE_T*1`($sp) |
| 901 | mtlr r0 |
| 902 | addi $sp,$sp,$FRAME |
| 903 | blr |
| 904 | .long 0 |
| 905 | .byte 0,12,0x04,1,0x80,18,5,0 |
| 906 | .long 0 |
| 907 | .size .ChaCha20_ctr32_vmx,.-.ChaCha20_ctr32_vmx |
| 908 | ___ |
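# A few words on the I/O handling in the VMX routine above: input is
# loaded with lvx and realigned with vperm through $inpperm, while
# output is "pre-misaligned" with $outperm and merged across register
# boundaries with vsel/$outmask so that whole quadwords can be stored
# with stvx.  The first and last partial quadwords of each 256-byte
# chunk are not stored that way, since that could touch bytes outside
# the buffer; they are written byte-by-byte with stvebx in
# Lunaligned_head_vmx/Lunaligned_tail_vmx, or with a single stvx at
# Laligned_vmx when $out happens to be 16-byte aligned.  Any remaining
# tail shorter than 256 bytes is handed back to __ChaCha20_1x.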
| 909 | }}} |
| 910 | {{{ |
| 911 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
| 912 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); |
| 913 | my @K = map("v$_",(16..19)); |
| 914 | my $CTR = "v26"; |
| 915 | my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); |
| 916 | my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); |
| 917 | my $beperm = "v31"; |
| 918 | |
| 919 | my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); |
| 920 | |
| 921 | my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload |
| 922 | |
| 923 | sub VSX_lane_ROUND { |
| 924 | my ($a0,$b0,$c0,$d0)=@_; |
| 925 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
| 926 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
| 927 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
| 928 | my @x=map("\"v$_\"",(0..15)); |
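	# VSX_lane_ROUND is the scalar ROUND transliterated to vectors: each
	# of the 16 state words v0-v15 holds the same word of four
	# consecutive blocks, one block per 32-bit lane, so every
	# vadduwm/vxor/vrlw below advances four blocks at once.  The rotate
	# amounts live in splatted registers ($sixteen,$twelve,$eight,$seven);
	# note that 16 is encoded as "vspltisw -16" in the outer loop, which
	# works because vrlw only looks at the low five bits of each element.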
| 929 | |
| 930 | ( |
| 931 | "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 |
| 932 | "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 |
| 933 | "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 |
| 934 | "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 |
| 935 | "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
| 936 | "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
| 937 | "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
| 938 | "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
| 939 | "&vrlw (@x[$d0],@x[$d0],'$sixteen')", |
| 940 | "&vrlw (@x[$d1],@x[$d1],'$sixteen')", |
| 941 | "&vrlw (@x[$d2],@x[$d2],'$sixteen')", |
| 942 | "&vrlw (@x[$d3],@x[$d3],'$sixteen')", |
| 943 | |
| 944 | "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
| 945 | "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
| 946 | "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
| 947 | "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
| 948 | "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
| 949 | "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
| 950 | "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
| 951 | "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
| 952 | "&vrlw (@x[$b0],@x[$b0],'$twelve')", |
| 953 | "&vrlw (@x[$b1],@x[$b1],'$twelve')", |
| 954 | "&vrlw (@x[$b2],@x[$b2],'$twelve')", |
| 955 | "&vrlw (@x[$b3],@x[$b3],'$twelve')", |
| 956 | |
| 957 | "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", |
| 958 | "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", |
| 959 | "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", |
| 960 | "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", |
| 961 | "&vxor (@x[$d0],@x[$d0],@x[$a0])", |
| 962 | "&vxor (@x[$d1],@x[$d1],@x[$a1])", |
| 963 | "&vxor (@x[$d2],@x[$d2],@x[$a2])", |
| 964 | "&vxor (@x[$d3],@x[$d3],@x[$a3])", |
| 965 | "&vrlw (@x[$d0],@x[$d0],'$eight')", |
| 966 | "&vrlw (@x[$d1],@x[$d1],'$eight')", |
| 967 | "&vrlw (@x[$d2],@x[$d2],'$eight')", |
| 968 | "&vrlw (@x[$d3],@x[$d3],'$eight')", |
| 969 | |
| 970 | "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", |
| 971 | "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", |
| 972 | "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", |
| 973 | "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", |
| 974 | "&vxor (@x[$b0],@x[$b0],@x[$c0])", |
| 975 | "&vxor (@x[$b1],@x[$b1],@x[$c1])", |
| 976 | "&vxor (@x[$b2],@x[$b2],@x[$c2])", |
| 977 | "&vxor (@x[$b3],@x[$b3],@x[$c3])", |
| 978 | "&vrlw (@x[$b0],@x[$b0],'$seven')", |
| 979 | "&vrlw (@x[$b1],@x[$b1],'$seven')", |
| 980 | "&vrlw (@x[$b2],@x[$b2],'$seven')", |
| 981 | "&vrlw (@x[$b3],@x[$b3],'$seven')" |
| 982 | ); |
| 983 | } |
| 984 | |
| 985 | $code.=<<___; |
| 986 | |
| 987 | .globl .ChaCha20_ctr32_vsx |
| 988 | .align 5 |
| 989 | .ChaCha20_ctr32_vsx: |
| 990 | $STU $sp,-$FRAME($sp) |
| 991 | mflr r0 |
| 992 | li r10,`15+$LOCALS+64` |
| 993 | li r11,`31+$LOCALS+64` |
| 994 | mfspr r12,256 |
| 995 | stvx v26,r10,$sp |
| 996 | addi r10,r10,32 |
| 997 | stvx v27,r11,$sp |
| 998 | addi r11,r11,32 |
| 999 | stvx v28,r10,$sp |
| 1000 | addi r10,r10,32 |
| 1001 | stvx v29,r11,$sp |
| 1002 | addi r11,r11,32 |
| 1003 | stvx v30,r10,$sp |
| 1004 | stvx v31,r11,$sp |
| 1005 | stw r12,`$FRAME-4`($sp) # save vrsave |
| 1006 | li r12,-4096+63 |
| 1007 | $PUSH r0, `$FRAME+$LRSAVE`($sp) |
| 1008 | mtspr 256,r12 # preserve 26 AltiVec registers (v0-v19,v26-v31) |
| 1009 | |
| 1010 | bl Lconsts # returns pointer to Lsigma in r12 |
| 1011 | lvx_4w @K[0],0,r12 # load sigma |
| 1012 | addi r12,r12,0x50 |
| 1013 | li $x10,16 |
| 1014 | li $x20,32 |
| 1015 | li $x30,48 |
| 1016 | li r11,64 |
| 1017 | |
| 1018 | lvx_4w @K[1],0,$key # load key |
| 1019 | lvx_4w @K[2],$x10,$key |
| 1020 | lvx_4w @K[3],0,$ctr # load counter |
| 1021 | |
| 1022 | vxor $xt0,$xt0,$xt0 |
| 1023 | lvx_4w $xt1,r11,r12 |
| 1024 | vspltw $CTR,@K[3],0 |
| 1025 | vsldoi @K[3],@K[3],$xt0,4 |
| 1026 | vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] |
| 1027 | vadduwm $CTR,$CTR,$xt1 |
| 1028 | |
| 1029 | be?lvsl $beperm,0,$x10 # 0x00..0f |
| 1030 | be?vspltisb $xt0,3 # 0x03..03 |
| 1031 | be?vxor $beperm,$beperm,$xt0 # swap bytes within words |
| 1032 | |
| 1033 | li r0,10 # inner loop counter |
| 1034 | mtctr r0 |
| 1035 | b Loop_outer_vsx |
| 1036 | |
| 1037 | .align 5 |
| 1038 | Loop_outer_vsx: |
| 1039 | lvx $xa0,$x00,r12 # load [smashed] sigma |
| 1040 | lvx $xa1,$x10,r12 |
| 1041 | lvx $xa2,$x20,r12 |
| 1042 | lvx $xa3,$x30,r12 |
| 1043 | |
| 1044 | vspltw $xb0,@K[1],0 # smash the key |
| 1045 | vspltw $xb1,@K[1],1 |
| 1046 | vspltw $xb2,@K[1],2 |
| 1047 | vspltw $xb3,@K[1],3 |
| 1048 | |
| 1049 | vspltw $xc0,@K[2],0 |
| 1050 | vspltw $xc1,@K[2],1 |
| 1051 | vspltw $xc2,@K[2],2 |
| 1052 | vspltw $xc3,@K[2],3 |
| 1053 | |
| 1054 | vmr $xd0,$CTR # smash the counter |
| 1055 | vspltw $xd1,@K[3],1 |
| 1056 | vspltw $xd2,@K[3],2 |
| 1057 | vspltw $xd3,@K[3],3 |
| 1058 | |
| 1059 | vspltisw $sixteen,-16 # synthesize constants |
| 1060 | vspltisw $twelve,12 |
| 1061 | vspltisw $eight,8 |
| 1062 | vspltisw $seven,7 |
| 1063 | |
| 1064 | Loop_vsx: |
| 1065 | ___ |
| 1066 | foreach (&VSX_lane_ROUND(0, 4, 8,12)) { eval; } |
| 1067 | foreach (&VSX_lane_ROUND(0, 5,10,15)) { eval; } |
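# As in the scalar path, each Loop_vsx iteration is one double round
# (column round followed by diagonal round), and the CTR value of 10
# gives the full 20 rounds.  Because the state is kept lane-major (one
# state word per register, four blocks in the lanes), the
# vmrgew/vmrgow/vpermdi sequence after the loop transposes it back to
# block-major order, after which the key-block accumulation, the
# optional byte swapping and the XOR/store of up to four 64-byte blocks
# proceed sequentially.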
| 1068 | $code.=<<___; |
| 1069 | bdnz Loop_vsx |
| 1070 | |
| 1071 | vadduwm $xd0,$xd0,$CTR |
| 1072 | |
| 1073 | vmrgew $xt0,$xa0,$xa1 # transpose data |
| 1074 | vmrgew $xt1,$xa2,$xa3 |
| 1075 | vmrgow $xa0,$xa0,$xa1 |
| 1076 | vmrgow $xa2,$xa2,$xa3 |
| 1077 | vmrgew $xt2,$xb0,$xb1 |
| 1078 | vmrgew $xt3,$xb2,$xb3 |
| 1079 | vpermdi $xa1,$xa0,$xa2,0b00 |
| 1080 | vpermdi $xa3,$xa0,$xa2,0b11 |
| 1081 | vpermdi $xa0,$xt0,$xt1,0b00 |
| 1082 | vpermdi $xa2,$xt0,$xt1,0b11 |
| 1083 | |
| 1084 | vmrgow $xb0,$xb0,$xb1 |
| 1085 | vmrgow $xb2,$xb2,$xb3 |
| 1086 | vmrgew $xt0,$xc0,$xc1 |
| 1087 | vmrgew $xt1,$xc2,$xc3 |
| 1088 | vpermdi $xb1,$xb0,$xb2,0b00 |
| 1089 | vpermdi $xb3,$xb0,$xb2,0b11 |
| 1090 | vpermdi $xb0,$xt2,$xt3,0b00 |
| 1091 | vpermdi $xb2,$xt2,$xt3,0b11 |
| 1092 | |
| 1093 | vmrgow $xc0,$xc0,$xc1 |
| 1094 | vmrgow $xc2,$xc2,$xc3 |
| 1095 | vmrgew $xt2,$xd0,$xd1 |
| 1096 | vmrgew $xt3,$xd2,$xd3 |
| 1097 | vpermdi $xc1,$xc0,$xc2,0b00 |
| 1098 | vpermdi $xc3,$xc0,$xc2,0b11 |
| 1099 | vpermdi $xc0,$xt0,$xt1,0b00 |
| 1100 | vpermdi $xc2,$xt0,$xt1,0b11 |
| 1101 | |
| 1102 | vmrgow $xd0,$xd0,$xd1 |
| 1103 | vmrgow $xd2,$xd2,$xd3 |
| 1104 | vspltisw $xt0,4 |
| 1105 | vadduwm $CTR,$CTR,$xt0 # next counter value |
| 1106 | vpermdi $xd1,$xd0,$xd2,0b00 |
| 1107 | vpermdi $xd3,$xd0,$xd2,0b11 |
| 1108 | vpermdi $xd0,$xt2,$xt3,0b00 |
| 1109 | vpermdi $xd2,$xt2,$xt3,0b11 |
| 1110 | |
| 1111 | vadduwm $xa0,$xa0,@K[0] |
| 1112 | vadduwm $xb0,$xb0,@K[1] |
| 1113 | vadduwm $xc0,$xc0,@K[2] |
| 1114 | vadduwm $xd0,$xd0,@K[3] |
| 1115 | |
| 1116 | be?vperm $xa0,$xa0,$xa0,$beperm |
| 1117 | be?vperm $xb0,$xb0,$xb0,$beperm |
| 1118 | be?vperm $xc0,$xc0,$xc0,$beperm |
| 1119 | be?vperm $xd0,$xd0,$xd0,$beperm |
| 1120 | |
| 1121 | ${UCMP}i $len,0x40 |
| 1122 | blt Ltail_vsx |
| 1123 | |
| 1124 | lvx_4w $xt0,$x00,$inp |
| 1125 | lvx_4w $xt1,$x10,$inp |
| 1126 | lvx_4w $xt2,$x20,$inp |
| 1127 | lvx_4w $xt3,$x30,$inp |
| 1128 | |
| 1129 | vxor $xt0,$xt0,$xa0 |
| 1130 | vxor $xt1,$xt1,$xb0 |
| 1131 | vxor $xt2,$xt2,$xc0 |
| 1132 | vxor $xt3,$xt3,$xd0 |
| 1133 | |
| 1134 | stvx_4w $xt0,$x00,$out |
| 1135 | stvx_4w $xt1,$x10,$out |
| 1136 | addi $inp,$inp,0x40 |
| 1137 | stvx_4w $xt2,$x20,$out |
| 1138 | subi $len,$len,0x40 |
| 1139 | stvx_4w $xt3,$x30,$out |
| 1140 | addi $out,$out,0x40 |
| 1141 | beq Ldone_vsx |
| 1142 | |
| 1143 | vadduwm $xa0,$xa1,@K[0] |
| 1144 | vadduwm $xb0,$xb1,@K[1] |
| 1145 | vadduwm $xc0,$xc1,@K[2] |
| 1146 | vadduwm $xd0,$xd1,@K[3] |
| 1147 | |
| 1148 | be?vperm $xa0,$xa0,$xa0,$beperm |
| 1149 | be?vperm $xb0,$xb0,$xb0,$beperm |
| 1150 | be?vperm $xc0,$xc0,$xc0,$beperm |
| 1151 | be?vperm $xd0,$xd0,$xd0,$beperm |
| 1152 | |
| 1153 | ${UCMP}i $len,0x40 |
| 1154 | blt Ltail_vsx |
| 1155 | |
| 1156 | lvx_4w $xt0,$x00,$inp |
| 1157 | lvx_4w $xt1,$x10,$inp |
| 1158 | lvx_4w $xt2,$x20,$inp |
| 1159 | lvx_4w $xt3,$x30,$inp |
| 1160 | |
| 1161 | vxor $xt0,$xt0,$xa0 |
| 1162 | vxor $xt1,$xt1,$xb0 |
| 1163 | vxor $xt2,$xt2,$xc0 |
| 1164 | vxor $xt3,$xt3,$xd0 |
| 1165 | |
| 1166 | stvx_4w $xt0,$x00,$out |
| 1167 | stvx_4w $xt1,$x10,$out |
| 1168 | addi $inp,$inp,0x40 |
| 1169 | stvx_4w $xt2,$x20,$out |
| 1170 | subi $len,$len,0x40 |
| 1171 | stvx_4w $xt3,$x30,$out |
| 1172 | addi $out,$out,0x40 |
| 1173 | beq Ldone_vsx |
| 1174 | |
| 1175 | vadduwm $xa0,$xa2,@K[0] |
| 1176 | vadduwm $xb0,$xb2,@K[1] |
| 1177 | vadduwm $xc0,$xc2,@K[2] |
| 1178 | vadduwm $xd0,$xd2,@K[3] |
| 1179 | |
| 1180 | be?vperm $xa0,$xa0,$xa0,$beperm |
| 1181 | be?vperm $xb0,$xb0,$xb0,$beperm |
| 1182 | be?vperm $xc0,$xc0,$xc0,$beperm |
| 1183 | be?vperm $xd0,$xd0,$xd0,$beperm |
| 1184 | |
| 1185 | ${UCMP}i $len,0x40 |
| 1186 | blt Ltail_vsx |
| 1187 | |
| 1188 | lvx_4w $xt0,$x00,$inp |
| 1189 | lvx_4w $xt1,$x10,$inp |
| 1190 | lvx_4w $xt2,$x20,$inp |
| 1191 | lvx_4w $xt3,$x30,$inp |
| 1192 | |
| 1193 | vxor $xt0,$xt0,$xa0 |
| 1194 | vxor $xt1,$xt1,$xb0 |
| 1195 | vxor $xt2,$xt2,$xc0 |
| 1196 | vxor $xt3,$xt3,$xd0 |
| 1197 | |
| 1198 | stvx_4w $xt0,$x00,$out |
| 1199 | stvx_4w $xt1,$x10,$out |
| 1200 | addi $inp,$inp,0x40 |
| 1201 | stvx_4w $xt2,$x20,$out |
| 1202 | subi $len,$len,0x40 |
| 1203 | stvx_4w $xt3,$x30,$out |
| 1204 | addi $out,$out,0x40 |
| 1205 | beq Ldone_vsx |
| 1206 | |
| 1207 | vadduwm $xa0,$xa3,@K[0] |
| 1208 | vadduwm $xb0,$xb3,@K[1] |
| 1209 | vadduwm $xc0,$xc3,@K[2] |
| 1210 | vadduwm $xd0,$xd3,@K[3] |
| 1211 | |
| 1212 | be?vperm $xa0,$xa0,$xa0,$beperm |
| 1213 | be?vperm $xb0,$xb0,$xb0,$beperm |
| 1214 | be?vperm $xc0,$xc0,$xc0,$beperm |
| 1215 | be?vperm $xd0,$xd0,$xd0,$beperm |
| 1216 | |
| 1217 | ${UCMP}i $len,0x40 |
| 1218 | blt Ltail_vsx |
| 1219 | |
| 1220 | lvx_4w $xt0,$x00,$inp |
| 1221 | lvx_4w $xt1,$x10,$inp |
| 1222 | lvx_4w $xt2,$x20,$inp |
| 1223 | lvx_4w $xt3,$x30,$inp |
| 1224 | |
| 1225 | vxor $xt0,$xt0,$xa0 |
| 1226 | vxor $xt1,$xt1,$xb0 |
| 1227 | vxor $xt2,$xt2,$xc0 |
| 1228 | vxor $xt3,$xt3,$xd0 |
| 1229 | |
| 1230 | stvx_4w $xt0,$x00,$out |
| 1231 | stvx_4w $xt1,$x10,$out |
| 1232 | addi $inp,$inp,0x40 |
| 1233 | stvx_4w $xt2,$x20,$out |
| 1234 | subi $len,$len,0x40 |
| 1235 | stvx_4w $xt3,$x30,$out |
| 1236 | addi $out,$out,0x40 |
| 1237 | mtctr r0 |
| 1238 | bne Loop_outer_vsx |
| 1239 | |
| 1240 | Ldone_vsx: |
| 1241 | lwz r12,`$FRAME-4`($sp) # pull vrsave |
| 1242 | li r10,`15+$LOCALS+64` |
| 1243 | li r11,`31+$LOCALS+64` |
| 1244 | $POP r0, `$FRAME+$LRSAVE`($sp) |
| 1245 | mtspr 256,r12 # restore vrsave |
| 1246 | lvx v26,r10,$sp |
| 1247 | addi r10,r10,32 |
| 1248 | lvx v27,r11,$sp |
| 1249 | addi r11,r11,32 |
| 1250 | lvx v28,r10,$sp |
| 1251 | addi r10,r10,32 |
| 1252 | lvx v29,r11,$sp |
| 1253 | addi r11,r11,32 |
| 1254 | lvx v30,r10,$sp |
| 1255 | lvx v31,r11,$sp |
| 1256 | mtlr r0 |
| 1257 | addi $sp,$sp,$FRAME |
| 1258 | blr |
| 1259 | |
| 1260 | .align 4 |
| 1261 | Ltail_vsx: |
| 1262 | addi r11,$sp,$LOCALS |
| 1263 | mtctr $len |
| 1264 | stvx_4w $xa0,$x00,r11 # offload block to stack |
| 1265 | stvx_4w $xb0,$x10,r11 |
| 1266 | stvx_4w $xc0,$x20,r11 |
| 1267 | stvx_4w $xd0,$x30,r11 |
| 1268 | subi r12,r11,1 # prepare for *++ptr |
| 1269 | subi $inp,$inp,1 |
| 1270 | subi $out,$out,1 |
| 1271 | |
| 1272 | Loop_tail_vsx: |
| 1273 | lbzu r6,1(r12) |
| 1274 | lbzu r7,1($inp) |
| 1275 | xor r6,r6,r7 |
| 1276 | stbu r6,1($out) |
| 1277 | bdnz Loop_tail_vsx |
| 1278 | |
| 1279 | stvx_4w $K[0],$x00,r11 # wipe copy of the block |
| 1280 | stvx_4w $K[0],$x10,r11 |
| 1281 | stvx_4w $K[0],$x20,r11 |
| 1282 | stvx_4w $K[0],$x30,r11 |
| 1283 | |
| 1284 | b Ldone_vsx |
| 1285 | .long 0 |
| 1286 | .byte 0,12,0x04,1,0x80,0,5,0 |
| 1287 | .long 0 |
| 1288 | .size .ChaCha20_ctr32_vsx,.-.ChaCha20_ctr32_vsx |
| 1289 | ___ |
| 1290 | }}} |
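# Lconsts below returns the address of Lsigma in r12 without needing any
# relocations: "bcl 20,31,$+4" is the customary always-taken
# branch-and-link used purely to capture the current address in LR (this
# particular form is commonly special-cased by the hardware so it does
# not disturb the return-address predictor), and the addi then adds the
# fixed distance to Lsigma.  The table itself holds the sigma constants,
# the {1,0,0,0}/{4,0,0,0} counter increments and the byte-permutation
# masks for the vperm-based rotations in the VMX path, followed by the
# lane-splatted sigma and the {0,1,2,3} lane counter used by the VSX
# path.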
| 1291 | $code.=<<___; |
| 1292 | .align 5 |
| 1293 | Lconsts: |
| 1294 | mflr r0 |
| 1295 | bcl 20,31,\$+4 |
| 1296 | mflr r12 # vvvvv distance between "." and Lsigma is 64-8 bytes |
| 1297 | addi r12,r12,`64-8` |
| 1298 | mtlr r0 |
| 1299 | blr |
| 1300 | .long 0 |
| 1301 | .byte 0,12,0x14,0,0,0,0,0 |
| 1302 | .space `64-9*4` |
| 1303 | Lsigma: |
| 1304 | .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 |
| 1305 | .long 1,0,0,0 |
| 1306 | .long 4,0,0,0 |
| 1307 | ___ |
| 1308 | $code.=<<___ if ($LITTLE_ENDIAN); |
| 1309 | .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001 |
| 1310 | .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300 |
| 1311 | ___ |
| 1312 | $code.=<<___ if (!$LITTLE_ENDIAN); # flipped words |
| 1313 | .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d |
| 1314 | .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c |
| 1315 | ___ |
| 1316 | $code.=<<___; |
| 1317 | .long 0x61707865,0x61707865,0x61707865,0x61707865 |
| 1318 | .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e |
| 1319 | .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 |
| 1320 | .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 |
| 1321 | .long 0,1,2,3 |
| 1322 | .asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>" |
| 1323 | .align 2 |
| 1324 | ___ |
| 1325 | |
| 1326 | foreach (split("\n",$code)) { |
| 1327 | s/\`([^\`]*)\`/eval $1/ge; |
| 1328 | |
| 1329 | # instructions prefixed with '?' are endian-specific and need |
| 1330 | # to be adjusted accordingly... |
| 1331 | if ($flavour !~ /le$/) { # big-endian |
| 1332 | s/be\?// or |
| 1333 | s/le\?/#le#/ or |
| 1334 | s/\?lvsr/lvsl/ or |
| 1335 | s/\?lvsl/lvsr/ or |
| 1336 | s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or |
| 1337 | s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; |
| 1338 | } else { # little-endian |
| 1339 | s/le\?// or |
| 1340 | s/be\?/#be#/ or |
| 1341 | s/\?([a-z]+)/$1/ or |
| 1342 | s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; |
| 1343 | } |
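	# Example of the endian fix-ups above: an instruction written as
	# "?lvsr $T0,0,$key" in the code is emitted as "lvsr" for
	# little-endian flavours but flipped to "lvsl" for big-endian ones,
	# and a "?vperm" has its first two source operands swapped, so a
	# single source sequence produces correct unaligned-access code for
	# both byte orders.  Likewise the vrldoi pseudo-instruction is
	# rewritten to vsldoi with the given shift amount, or 16 minus it,
	# depending on endianness.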
| 1344 | |
| 1345 | print $_,"\n"; |
| 1346 | } |
| 1347 | |
| 1348 | close STDOUT or die "error closing STDOUT: $!"; |