yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame^] | 1 | #!/usr/bin/env perl |
| 2 | # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | # |
| 9 | # ==================================================================== |
| 10 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 11 | # project. The module is, however, dual licensed under OpenSSL and |
| 12 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 13 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 14 | # ==================================================================== |
| 15 | # |
| 16 | # Keccak-1600 for PPC64. |
| 17 | # |
| 18 | # June 2017. |
| 19 | # |
| 20 | # This is a straightforward KECCAK_1X_ALT implementation that works on |
| 21 | # *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and |
| 22 | # it's possible to achieve performance better than below, but that is |
| 23 | # naturally an option only for POWER8 and successors... |
| 24 | # |
| 25 | ###################################################################### |
| 26 | # Numbers are cycles per processed byte. |
| 27 | # |
| 28 | # r=1088(*) |
| 29 | # |
| 30 | # PPC970/G5 14.6/+120% |
| 31 | # POWER7 10.3/+100% |
| 32 | # POWER8 11.5/+85% |
| 33 | # POWER9 9.4/+45% |
| 34 | # |
| 35 | # (*) Corresponds to SHA3-256. Percentage after slash is improvement |
| 36 | # over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do |
| 37 | # much better (but watch out for them generating code specific |
| 38 | # to processor they execute on). |
| 39 | |
| 40 | $flavour = shift; |
| 41 | |
# Select the ABI-dependent parameters from $flavour (the first command-line
# argument). Only flavours whose name contains "64" are accepted; for them
# this sets the pointer size ($SIZE_T), the offset of the link-register save
# slot ($LRSAVE), and the mnemonics used for unsigned compare ($UCMP),
# store-with-update ($STU), load ($POP) and store ($PUSH). Any other flavour
# aborts the script.
| 42 | if ($flavour =~ /64/) { |
| 43 | $SIZE_T =8; |
| 44 | $LRSAVE =2*$SIZE_T; |
| 45 | $UCMP ="cmpld"; |
| 46 | $STU ="stdu"; |
| 47 | $POP ="ld"; |
| 48 | $PUSH ="std"; |
| 49 | } else { die "nonsense $flavour"; } |
| 50 | |
# Locate the ppc-xlate.pl translator: look first in the directory part of $0
# (the path this script was invoked with), then in ../../perlasm relative to
# that directory. Abort if neither candidate is an existing file.
| 51 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 52 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
| 53 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
| 54 | die "can't locate ppc-xlate.pl"; |
| 55 | |
# Redirect STDOUT into a pipe to ppc-xlate.pl, passing it the flavour and the
# next command-line argument (the output file name). The open() call is
# parenthesised and checked with low-precedence "or": with "||" the test
# applied to the (always true) command string rather than to open()'s return
# value, so a failure to start the translator was silently ignored.
| 56 | open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; |
| 57 | |
# Stack frame layout used by KeccakF1600 and SHA3_absorb (byte offsets from
# the new stack pointer; with $SIZE_T == 8 the frame is 272 bytes):
#   $LOCALS .. $TEMP-1              six $SIZE_T-sized local slots; slot 0
#                                   holds the A[][] pointer, 1 inp, 2 len,
#                                   3 bsz, 4 the current iotas pointer
#   $TEMP .. $TEMP+31               spill area for A[0..3][4] during Theta
#   $FRAME-18*$SIZE_T .. $FRAME-1   save area for r14-r31
# The 6*$SIZE_T bytes below $LOCALS are presumably the ABI linkage area
# (TODO confirm against the target ABI).
| 58 | $FRAME=24*$SIZE_T+6*$SIZE_T+32; |
| 59 | $LOCALS=6*$SIZE_T; |
| 60 | $TEMP=$LOCALS+6*$SIZE_T; |
| 61 | |
# r1 is used as the stack pointer by all of the generated routines.
| 62 | my $sp ="r1"; |
| 63 | |
# Register allocation for the 5x5 Keccak state, indexed as $A[row][column]:
# row i occupies five consecutive registers starting at r7, r12, r17, r22 and
# r27 respectively. That would place A[1][1] in r13, so it is moved to r6.
| 64 | my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ], |
| 65 | (7, 12, 17, 22, 27)); |
| 66 | $A[1][1] = "r6"; # r13 is reserved |
| 67 | |
# Scratch registers r0, r3, r4 and r5: they hold the column parities in
# Theta, saved lane values in Rho+Pi and the andc results in Chi.
| 68 | my @C = map("r$_", (0,3,4,5)); |
| 69 | |
# Left-rotation amounts for the Rho step, indexed [row][column] like @A: in
# the Rho+Pi code the lane that was A[i][j] is rotated by $rhotates[i][j]
# bits while being moved to its new position.
| 70 | my @rhotates = ([ 0, 1, 62, 28, 27 ], |
| 71 | [ 36, 44, 6, 55, 20 ], |
| 72 | [ 3, 10, 43, 25, 39 ], |
| 73 | [ 41, 45, 15, 21, 8 ], |
| 74 | [ 18, 2, 61, 56, 14 ]); |
| 75 | |
| 76 | $code.=<<___; |
| 77 | .text |
| 78 | |
| 79 | .type KeccakF1600_int,\@function |
| 80 | .align 5 |
| 81 | KeccakF1600_int: |
| 82 | li r0,24 |
| 83 | mtctr r0 |
| 84 | b .Loop |
| 85 | .align 4 |
| 86 | .Loop: |
| 87 | xor $C[0],$A[0][0],$A[1][0] ; Theta |
| 88 | std $A[0][4],`$TEMP+0`($sp) |
| 89 | xor $C[1],$A[0][1],$A[1][1] |
| 90 | std $A[1][4],`$TEMP+8`($sp) |
| 91 | xor $C[2],$A[0][2],$A[1][2] |
| 92 | std $A[2][4],`$TEMP+16`($sp) |
| 93 | xor $C[3],$A[0][3],$A[1][3] |
| 94 | std $A[3][4],`$TEMP+24`($sp) |
| 95 | ___ |
# Theta uses eight temporaries, but @C only names four registers. The code
# emitted just above has spilled A[0..3][4] to the $TEMP area, so for the
# rest of Theta their registers are reused under the names C[4..7].
| 96 | $C[4]=$A[0][4]; |
| 97 | $C[5]=$A[1][4]; |
| 98 | $C[6]=$A[2][4]; |
| 99 | $C[7]=$A[3][4]; |
| 100 | $code.=<<___; |
| 101 | xor $C[4],$A[0][4],$A[1][4] |
| 102 | xor $C[0],$C[0],$A[2][0] |
| 103 | xor $C[1],$C[1],$A[2][1] |
| 104 | xor $C[2],$C[2],$A[2][2] |
| 105 | xor $C[3],$C[3],$A[2][3] |
| 106 | xor $C[4],$C[4],$A[2][4] |
| 107 | xor $C[0],$C[0],$A[3][0] |
| 108 | xor $C[1],$C[1],$A[3][1] |
| 109 | xor $C[2],$C[2],$A[3][2] |
| 110 | xor $C[3],$C[3],$A[3][3] |
| 111 | xor $C[4],$C[4],$A[3][4] |
| 112 | xor $C[0],$C[0],$A[4][0] |
| 113 | xor $C[2],$C[2],$A[4][2] |
| 114 | xor $C[1],$C[1],$A[4][1] |
| 115 | xor $C[3],$C[3],$A[4][3] |
| 116 | rotldi $C[5],$C[2],1 |
| 117 | xor $C[4],$C[4],$A[4][4] |
| 118 | rotldi $C[6],$C[3],1 |
| 119 | xor $C[5],$C[5],$C[0] |
| 120 | rotldi $C[7],$C[4],1 |
| 121 | |
| 122 | xor $A[0][1],$A[0][1],$C[5] |
| 123 | xor $A[1][1],$A[1][1],$C[5] |
| 124 | xor $A[2][1],$A[2][1],$C[5] |
| 125 | xor $A[3][1],$A[3][1],$C[5] |
| 126 | xor $A[4][1],$A[4][1],$C[5] |
| 127 | |
| 128 | rotldi $C[5],$C[0],1 |
| 129 | xor $C[6],$C[6],$C[1] |
| 130 | xor $C[2],$C[2],$C[7] |
| 131 | rotldi $C[7],$C[1],1 |
| 132 | xor $C[3],$C[3],$C[5] |
| 133 | xor $C[4],$C[4],$C[7] |
| 134 | |
| 135 | xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2] |
| 136 | xor $A[1][2],$A[1][2],$C[6] |
| 137 | xor $A[2][2],$A[2][2],$C[6] |
| 138 | xor $A[3][2],$A[3][2],$C[6] |
| 139 | xor $A[4][2],$A[4][2],$C[6] |
| 140 | |
| 141 | xor $A[0][0],$A[0][0],$C[4] |
| 142 | xor $A[1][0],$A[1][0],$C[4] |
| 143 | xor $A[2][0],$A[2][0],$C[4] |
| 144 | xor $A[3][0],$A[3][0],$C[4] |
| 145 | xor $A[4][0],$A[4][0],$C[4] |
| 146 | ___ |
# End of the C[4..7] aliasing: clear the names, presumably so that any later
# use of them in a template would expand to nothing instead of silently
# clobbering a state register. The code emitted next reloads A[0..3][4] from
# the $TEMP area.
| 147 | $C[4]=undef; |
| 148 | $C[5]=undef; |
| 149 | $C[6]=undef; |
| 150 | $C[7]=undef; |
| 151 | $code.=<<___; |
| 152 | ld $A[0][4],`$TEMP+0`($sp) |
| 153 | xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3] |
| 154 | ld $A[1][4],`$TEMP+8`($sp) |
| 155 | xor $A[1][3],$A[1][3],$C[2] |
| 156 | ld $A[2][4],`$TEMP+16`($sp) |
| 157 | xor $A[2][3],$A[2][3],$C[2] |
| 158 | ld $A[3][4],`$TEMP+24`($sp) |
| 159 | xor $A[3][3],$A[3][3],$C[2] |
| 160 | xor $A[4][3],$A[4][3],$C[2] |
| 161 | |
| 162 | xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4] |
| 163 | xor $A[1][4],$A[1][4],$C[3] |
| 164 | xor $A[2][4],$A[2][4],$C[3] |
| 165 | xor $A[3][4],$A[3][4],$C[3] |
| 166 | xor $A[4][4],$A[4][4],$C[3] |
| 167 | |
| 168 | mr $C[3],$A[0][1] ; Rho+Pi |
| 169 | rotldi $A[0][1],$A[1][1],$rhotates[1][1] |
| 170 | ;mr $C[1],$A[0][2] |
| 171 | rotldi $A[0][2],$A[2][2],$rhotates[2][2] |
| 172 | ;mr $C[0],$A[0][3] |
| 173 | rotldi $A[0][3],$A[3][3],$rhotates[3][3] |
| 174 | ;mr $C[2],$A[0][4] |
| 175 | rotldi $A[0][4],$A[4][4],$rhotates[4][4] |
| 176 | |
| 177 | rotldi $A[1][1],$A[1][4],$rhotates[1][4] |
| 178 | rotldi $A[2][2],$A[2][3],$rhotates[2][3] |
| 179 | rotldi $A[3][3],$A[3][2],$rhotates[3][2] |
| 180 | rotldi $A[4][4],$A[4][1],$rhotates[4][1] |
| 181 | |
| 182 | rotldi $A[1][4],$A[4][2],$rhotates[4][2] |
| 183 | rotldi $A[2][3],$A[3][4],$rhotates[3][4] |
| 184 | rotldi $A[3][2],$A[2][1],$rhotates[2][1] |
| 185 | rotldi $A[4][1],$A[1][3],$rhotates[1][3] |
| 186 | |
| 187 | rotldi $A[4][2],$A[2][4],$rhotates[2][4] |
| 188 | rotldi $A[3][4],$A[4][3],$rhotates[4][3] |
| 189 | rotldi $A[2][1],$A[1][2],$rhotates[1][2] |
| 190 | rotldi $A[1][3],$A[3][1],$rhotates[3][1] |
| 191 | |
| 192 | rotldi $A[2][4],$A[4][0],$rhotates[4][0] |
| 193 | rotldi $A[4][3],$A[3][0],$rhotates[3][0] |
| 194 | rotldi $A[1][2],$A[2][0],$rhotates[2][0] |
| 195 | rotldi $A[3][1],$A[1][0],$rhotates[1][0] |
| 196 | |
| 197 | rotldi $A[1][0],$C[0],$rhotates[0][3] |
| 198 | rotldi $A[2][0],$C[3],$rhotates[0][1] |
| 199 | rotldi $A[3][0],$C[2],$rhotates[0][4] |
| 200 | rotldi $A[4][0],$C[1],$rhotates[0][2] |
| 201 | |
| 202 | andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota |
| 203 | andc $C[1],$A[0][3],$A[0][2] |
| 204 | andc $C[2],$A[0][0],$A[0][4] |
| 205 | andc $C[3],$A[0][1],$A[0][0] |
| 206 | xor $A[0][0],$A[0][0],$C[0] |
| 207 | andc $C[0],$A[0][4],$A[0][3] |
| 208 | xor $A[0][1],$A[0][1],$C[1] |
| 209 | ld $C[1],`$LOCALS+4*$SIZE_T`($sp) |
| 210 | xor $A[0][3],$A[0][3],$C[2] |
| 211 | xor $A[0][4],$A[0][4],$C[3] |
| 212 | xor $A[0][2],$A[0][2],$C[0] |
| 213 | ldu $C[3],8($C[1]) ; Iota[i++] |
| 214 | |
| 215 | andc $C[0],$A[1][2],$A[1][1] |
| 216 | std $C[1],`$LOCALS+4*$SIZE_T`($sp) |
| 217 | andc $C[1],$A[1][3],$A[1][2] |
| 218 | andc $C[2],$A[1][0],$A[1][4] |
| 219 | xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota |
| 220 | andc $C[3],$A[1][1],$A[1][0] |
| 221 | xor $A[1][0],$A[1][0],$C[0] |
| 222 | andc $C[0],$A[1][4],$A[1][3] |
| 223 | xor $A[1][1],$A[1][1],$C[1] |
| 224 | xor $A[1][3],$A[1][3],$C[2] |
| 225 | xor $A[1][4],$A[1][4],$C[3] |
| 226 | xor $A[1][2],$A[1][2],$C[0] |
| 227 | |
| 228 | andc $C[0],$A[2][2],$A[2][1] |
| 229 | andc $C[1],$A[2][3],$A[2][2] |
| 230 | andc $C[2],$A[2][0],$A[2][4] |
| 231 | andc $C[3],$A[2][1],$A[2][0] |
| 232 | xor $A[2][0],$A[2][0],$C[0] |
| 233 | andc $C[0],$A[2][4],$A[2][3] |
| 234 | xor $A[2][1],$A[2][1],$C[1] |
| 235 | xor $A[2][3],$A[2][3],$C[2] |
| 236 | xor $A[2][4],$A[2][4],$C[3] |
| 237 | xor $A[2][2],$A[2][2],$C[0] |
| 238 | |
| 239 | andc $C[0],$A[3][2],$A[3][1] |
| 240 | andc $C[1],$A[3][3],$A[3][2] |
| 241 | andc $C[2],$A[3][0],$A[3][4] |
| 242 | andc $C[3],$A[3][1],$A[3][0] |
| 243 | xor $A[3][0],$A[3][0],$C[0] |
| 244 | andc $C[0],$A[3][4],$A[3][3] |
| 245 | xor $A[3][1],$A[3][1],$C[1] |
| 246 | xor $A[3][3],$A[3][3],$C[2] |
| 247 | xor $A[3][4],$A[3][4],$C[3] |
| 248 | xor $A[3][2],$A[3][2],$C[0] |
| 249 | |
| 250 | andc $C[0],$A[4][2],$A[4][1] |
| 251 | andc $C[1],$A[4][3],$A[4][2] |
| 252 | andc $C[2],$A[4][0],$A[4][4] |
| 253 | andc $C[3],$A[4][1],$A[4][0] |
| 254 | xor $A[4][0],$A[4][0],$C[0] |
| 255 | andc $C[0],$A[4][4],$A[4][3] |
| 256 | xor $A[4][1],$A[4][1],$C[1] |
| 257 | xor $A[4][3],$A[4][3],$C[2] |
| 258 | xor $A[4][4],$A[4][4],$C[3] |
| 259 | xor $A[4][2],$A[4][2],$C[0] |
| 260 | |
| 261 | bdnz .Loop |
| 262 | |
| 263 | blr |
| 264 | .long 0 |
| 265 | .byte 0,12,0x14,0,0,0,0,0 |
| 266 | .size KeccakF1600_int,.-KeccakF1600_int |
| 267 | |
| 268 | .type KeccakF1600,\@function |
| 269 | .align 5 |
| 270 | KeccakF1600: |
| 271 | $STU $sp,-$FRAME($sp) |
| 272 | mflr r0 |
| 273 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| 274 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 275 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 276 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| 277 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) |
| 278 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) |
| 279 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) |
| 280 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) |
| 281 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) |
| 282 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) |
| 283 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) |
| 284 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) |
| 285 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) |
| 286 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) |
| 287 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) |
| 288 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 289 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 290 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 291 | $PUSH r0,`$FRAME+$LRSAVE`($sp) |
| 292 | |
| 293 | bl PICmeup |
| 294 | subi r12,r12,8 ; prepare for ldu |
| 295 | |
| 296 | $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) |
| 297 | ;$PUSH r4,`$LOCALS+1*$SIZE_T`($sp) |
| 298 | ;$PUSH r5,`$LOCALS+2*$SIZE_T`($sp) |
| 299 | ;$PUSH r6,`$LOCALS+3*$SIZE_T`($sp) |
| 300 | $PUSH r12,`$LOCALS+4*$SIZE_T`($sp) |
| 301 | |
| 302 | ld $A[0][0],`8*0`(r3) ; load A[5][5] |
| 303 | ld $A[0][1],`8*1`(r3) |
| 304 | ld $A[0][2],`8*2`(r3) |
| 305 | ld $A[0][3],`8*3`(r3) |
| 306 | ld $A[0][4],`8*4`(r3) |
| 307 | ld $A[1][0],`8*5`(r3) |
| 308 | ld $A[1][1],`8*6`(r3) |
| 309 | ld $A[1][2],`8*7`(r3) |
| 310 | ld $A[1][3],`8*8`(r3) |
| 311 | ld $A[1][4],`8*9`(r3) |
| 312 | ld $A[2][0],`8*10`(r3) |
| 313 | ld $A[2][1],`8*11`(r3) |
| 314 | ld $A[2][2],`8*12`(r3) |
| 315 | ld $A[2][3],`8*13`(r3) |
| 316 | ld $A[2][4],`8*14`(r3) |
| 317 | ld $A[3][0],`8*15`(r3) |
| 318 | ld $A[3][1],`8*16`(r3) |
| 319 | ld $A[3][2],`8*17`(r3) |
| 320 | ld $A[3][3],`8*18`(r3) |
| 321 | ld $A[3][4],`8*19`(r3) |
| 322 | ld $A[4][0],`8*20`(r3) |
| 323 | ld $A[4][1],`8*21`(r3) |
| 324 | ld $A[4][2],`8*22`(r3) |
| 325 | ld $A[4][3],`8*23`(r3) |
| 326 | ld $A[4][4],`8*24`(r3) |
| 327 | |
| 328 | bl KeccakF1600_int |
| 329 | |
| 330 | $POP r3,`$LOCALS+0*$SIZE_T`($sp) |
| 331 | std $A[0][0],`8*0`(r3) ; return A[5][5] |
| 332 | std $A[0][1],`8*1`(r3) |
| 333 | std $A[0][2],`8*2`(r3) |
| 334 | std $A[0][3],`8*3`(r3) |
| 335 | std $A[0][4],`8*4`(r3) |
| 336 | std $A[1][0],`8*5`(r3) |
| 337 | std $A[1][1],`8*6`(r3) |
| 338 | std $A[1][2],`8*7`(r3) |
| 339 | std $A[1][3],`8*8`(r3) |
| 340 | std $A[1][4],`8*9`(r3) |
| 341 | std $A[2][0],`8*10`(r3) |
| 342 | std $A[2][1],`8*11`(r3) |
| 343 | std $A[2][2],`8*12`(r3) |
| 344 | std $A[2][3],`8*13`(r3) |
| 345 | std $A[2][4],`8*14`(r3) |
| 346 | std $A[3][0],`8*15`(r3) |
| 347 | std $A[3][1],`8*16`(r3) |
| 348 | std $A[3][2],`8*17`(r3) |
| 349 | std $A[3][3],`8*18`(r3) |
| 350 | std $A[3][4],`8*19`(r3) |
| 351 | std $A[4][0],`8*20`(r3) |
| 352 | std $A[4][1],`8*21`(r3) |
| 353 | std $A[4][2],`8*22`(r3) |
| 354 | std $A[4][3],`8*23`(r3) |
| 355 | std $A[4][4],`8*24`(r3) |
| 356 | |
| 357 | $POP r0,`$FRAME+$LRSAVE`($sp) |
| 358 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| 359 | $POP r15,`$FRAME-$SIZE_T*17`($sp) |
| 360 | $POP r16,`$FRAME-$SIZE_T*16`($sp) |
| 361 | $POP r17,`$FRAME-$SIZE_T*15`($sp) |
| 362 | $POP r18,`$FRAME-$SIZE_T*14`($sp) |
| 363 | $POP r19,`$FRAME-$SIZE_T*13`($sp) |
| 364 | $POP r20,`$FRAME-$SIZE_T*12`($sp) |
| 365 | $POP r21,`$FRAME-$SIZE_T*11`($sp) |
| 366 | $POP r22,`$FRAME-$SIZE_T*10`($sp) |
| 367 | $POP r23,`$FRAME-$SIZE_T*9`($sp) |
| 368 | $POP r24,`$FRAME-$SIZE_T*8`($sp) |
| 369 | $POP r25,`$FRAME-$SIZE_T*7`($sp) |
| 370 | $POP r26,`$FRAME-$SIZE_T*6`($sp) |
| 371 | $POP r27,`$FRAME-$SIZE_T*5`($sp) |
| 372 | $POP r28,`$FRAME-$SIZE_T*4`($sp) |
| 373 | $POP r29,`$FRAME-$SIZE_T*3`($sp) |
| 374 | $POP r30,`$FRAME-$SIZE_T*2`($sp) |
| 375 | $POP r31,`$FRAME-$SIZE_T*1`($sp) |
| 376 | mtlr r0 |
| 377 | addi $sp,$sp,$FRAME |
| 378 | blr |
| 379 | .long 0 |
| 380 | .byte 0,12,4,1,0x80,18,1,0 |
| 381 | .long 0 |
| 382 | .size KeccakF1600,.-KeccakF1600 |
| 383 | |
| 384 | .type dword_le_load,\@function |
| 385 | .align 5 |
| 386 | dword_le_load: |
| 387 | lbzu r0,1(r3) |
| 388 | lbzu r4,1(r3) |
| 389 | lbzu r5,1(r3) |
| 390 | insrdi r0,r4,8,48 |
| 391 | lbzu r4,1(r3) |
| 392 | insrdi r0,r5,8,40 |
| 393 | lbzu r5,1(r3) |
| 394 | insrdi r0,r4,8,32 |
| 395 | lbzu r4,1(r3) |
| 396 | insrdi r0,r5,8,24 |
| 397 | lbzu r5,1(r3) |
| 398 | insrdi r0,r4,8,16 |
| 399 | lbzu r4,1(r3) |
| 400 | insrdi r0,r5,8,8 |
| 401 | insrdi r0,r4,8,0 |
| 402 | blr |
| 403 | .long 0 |
| 404 | .byte 0,12,0x14,0,0,0,1,0 |
| 405 | .long 0 |
| 406 | .size dword_le_load,.-dword_le_load |
| 407 | |
| 408 | .globl SHA3_absorb |
| 409 | .type SHA3_absorb,\@function |
| 410 | .align 5 |
| 411 | SHA3_absorb: |
| 412 | $STU $sp,-$FRAME($sp) |
| 413 | mflr r0 |
| 414 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| 415 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 416 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 417 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| 418 | $PUSH r18,`$FRAME-$SIZE_T*14`($sp) |
| 419 | $PUSH r19,`$FRAME-$SIZE_T*13`($sp) |
| 420 | $PUSH r20,`$FRAME-$SIZE_T*12`($sp) |
| 421 | $PUSH r21,`$FRAME-$SIZE_T*11`($sp) |
| 422 | $PUSH r22,`$FRAME-$SIZE_T*10`($sp) |
| 423 | $PUSH r23,`$FRAME-$SIZE_T*9`($sp) |
| 424 | $PUSH r24,`$FRAME-$SIZE_T*8`($sp) |
| 425 | $PUSH r25,`$FRAME-$SIZE_T*7`($sp) |
| 426 | $PUSH r26,`$FRAME-$SIZE_T*6`($sp) |
| 427 | $PUSH r27,`$FRAME-$SIZE_T*5`($sp) |
| 428 | $PUSH r28,`$FRAME-$SIZE_T*4`($sp) |
| 429 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 430 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 431 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 432 | $PUSH r0,`$FRAME+$LRSAVE`($sp) |
| 433 | |
| 434 | bl PICmeup |
| 435 | subi r4,r4,1 ; prepare for lbzu |
| 436 | subi r12,r12,8 ; prepare for ldu |
| 437 | |
| 438 | $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][] |
| 439 | $PUSH r4,`$LOCALS+1*$SIZE_T`($sp) ; save inp |
| 440 | $PUSH r5,`$LOCALS+2*$SIZE_T`($sp) ; save len |
| 441 | $PUSH r6,`$LOCALS+3*$SIZE_T`($sp) ; save bsz |
| 442 | mr r0,r6 |
| 443 | $PUSH r12,`$LOCALS+4*$SIZE_T`($sp) |
| 444 | |
| 445 | ld $A[0][0],`8*0`(r3) ; load A[5][5] |
| 446 | ld $A[0][1],`8*1`(r3) |
| 447 | ld $A[0][2],`8*2`(r3) |
| 448 | ld $A[0][3],`8*3`(r3) |
| 449 | ld $A[0][4],`8*4`(r3) |
| 450 | ld $A[1][0],`8*5`(r3) |
| 451 | ld $A[1][1],`8*6`(r3) |
| 452 | ld $A[1][2],`8*7`(r3) |
| 453 | ld $A[1][3],`8*8`(r3) |
| 454 | ld $A[1][4],`8*9`(r3) |
| 455 | ld $A[2][0],`8*10`(r3) |
| 456 | ld $A[2][1],`8*11`(r3) |
| 457 | ld $A[2][2],`8*12`(r3) |
| 458 | ld $A[2][3],`8*13`(r3) |
| 459 | ld $A[2][4],`8*14`(r3) |
| 460 | ld $A[3][0],`8*15`(r3) |
| 461 | ld $A[3][1],`8*16`(r3) |
| 462 | ld $A[3][2],`8*17`(r3) |
| 463 | ld $A[3][3],`8*18`(r3) |
| 464 | ld $A[3][4],`8*19`(r3) |
| 465 | ld $A[4][0],`8*20`(r3) |
| 466 | ld $A[4][1],`8*21`(r3) |
| 467 | ld $A[4][2],`8*22`(r3) |
| 468 | ld $A[4][3],`8*23`(r3) |
| 469 | ld $A[4][4],`8*24`(r3) |
| 470 | |
| 471 | mr r3,r4 |
| 472 | mr r4,r5 |
| 473 | mr r5,r0 |
| 474 | |
| 475 | b .Loop_absorb |
| 476 | |
| 477 | .align 4 |
| 478 | .Loop_absorb: |
| 479 | $UCMP r4,r5 ; len < bsz? |
| 480 | blt .Labsorbed |
| 481 | |
| 482 | sub r4,r4,r5 ; len -= bsz |
| 483 | srwi r5,r5,3 |
| 484 | $PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len |
| 485 | mtctr r5 |
| 486 | bl dword_le_load ; *inp++ |
| 487 | xor $A[0][0],$A[0][0],r0 |
| 488 | bdz .Lprocess_block |
| 489 | bl dword_le_load ; *inp++ |
| 490 | xor $A[0][1],$A[0][1],r0 |
| 491 | bdz .Lprocess_block |
| 492 | bl dword_le_load ; *inp++ |
| 493 | xor $A[0][2],$A[0][2],r0 |
| 494 | bdz .Lprocess_block |
| 495 | bl dword_le_load ; *inp++ |
| 496 | xor $A[0][3],$A[0][3],r0 |
| 497 | bdz .Lprocess_block |
| 498 | bl dword_le_load ; *inp++ |
| 499 | xor $A[0][4],$A[0][4],r0 |
| 500 | bdz .Lprocess_block |
| 501 | bl dword_le_load ; *inp++ |
| 502 | xor $A[1][0],$A[1][0],r0 |
| 503 | bdz .Lprocess_block |
| 504 | bl dword_le_load ; *inp++ |
| 505 | xor $A[1][1],$A[1][1],r0 |
| 506 | bdz .Lprocess_block |
| 507 | bl dword_le_load ; *inp++ |
| 508 | xor $A[1][2],$A[1][2],r0 |
| 509 | bdz .Lprocess_block |
| 510 | bl dword_le_load ; *inp++ |
| 511 | xor $A[1][3],$A[1][3],r0 |
| 512 | bdz .Lprocess_block |
| 513 | bl dword_le_load ; *inp++ |
| 514 | xor $A[1][4],$A[1][4],r0 |
| 515 | bdz .Lprocess_block |
| 516 | bl dword_le_load ; *inp++ |
| 517 | xor $A[2][0],$A[2][0],r0 |
| 518 | bdz .Lprocess_block |
| 519 | bl dword_le_load ; *inp++ |
| 520 | xor $A[2][1],$A[2][1],r0 |
| 521 | bdz .Lprocess_block |
| 522 | bl dword_le_load ; *inp++ |
| 523 | xor $A[2][2],$A[2][2],r0 |
| 524 | bdz .Lprocess_block |
| 525 | bl dword_le_load ; *inp++ |
| 526 | xor $A[2][3],$A[2][3],r0 |
| 527 | bdz .Lprocess_block |
| 528 | bl dword_le_load ; *inp++ |
| 529 | xor $A[2][4],$A[2][4],r0 |
| 530 | bdz .Lprocess_block |
| 531 | bl dword_le_load ; *inp++ |
| 532 | xor $A[3][0],$A[3][0],r0 |
| 533 | bdz .Lprocess_block |
| 534 | bl dword_le_load ; *inp++ |
| 535 | xor $A[3][1],$A[3][1],r0 |
| 536 | bdz .Lprocess_block |
| 537 | bl dword_le_load ; *inp++ |
| 538 | xor $A[3][2],$A[3][2],r0 |
| 539 | bdz .Lprocess_block |
| 540 | bl dword_le_load ; *inp++ |
| 541 | xor $A[3][3],$A[3][3],r0 |
| 542 | bdz .Lprocess_block |
| 543 | bl dword_le_load ; *inp++ |
| 544 | xor $A[3][4],$A[3][4],r0 |
| 545 | bdz .Lprocess_block |
| 546 | bl dword_le_load ; *inp++ |
| 547 | xor $A[4][0],$A[4][0],r0 |
| 548 | bdz .Lprocess_block |
| 549 | bl dword_le_load ; *inp++ |
| 550 | xor $A[4][1],$A[4][1],r0 |
| 551 | bdz .Lprocess_block |
| 552 | bl dword_le_load ; *inp++ |
| 553 | xor $A[4][2],$A[4][2],r0 |
| 554 | bdz .Lprocess_block |
| 555 | bl dword_le_load ; *inp++ |
| 556 | xor $A[4][3],$A[4][3],r0 |
| 557 | bdz .Lprocess_block |
| 558 | bl dword_le_load ; *inp++ |
| 559 | xor $A[4][4],$A[4][4],r0 |
| 560 | |
| 561 | .Lprocess_block: |
| 562 | $PUSH r3,`$LOCALS+1*$SIZE_T`($sp) ; save inp |
| 563 | |
| 564 | bl KeccakF1600_int |
| 565 | |
| 566 | $POP r0,`$LOCALS+4*$SIZE_T`($sp) ; pull iotas[24] |
| 567 | $POP r5,`$LOCALS+3*$SIZE_T`($sp) ; restore bsz |
| 568 | $POP r4,`$LOCALS+2*$SIZE_T`($sp) ; restore len |
| 569 | $POP r3,`$LOCALS+1*$SIZE_T`($sp) ; restore inp |
| 570 | addic r0,r0,`-8*24` ; rewind iotas |
| 571 | $PUSH r0,`$LOCALS+4*$SIZE_T`($sp) |
| 572 | |
| 573 | b .Loop_absorb |
| 574 | |
| 575 | .align 4 |
| 576 | .Labsorbed: |
| 577 | $POP r3,`$LOCALS+0*$SIZE_T`($sp) |
| 578 | std $A[0][0],`8*0`(r3) ; return A[5][5] |
| 579 | std $A[0][1],`8*1`(r3) |
| 580 | std $A[0][2],`8*2`(r3) |
| 581 | std $A[0][3],`8*3`(r3) |
| 582 | std $A[0][4],`8*4`(r3) |
| 583 | std $A[1][0],`8*5`(r3) |
| 584 | std $A[1][1],`8*6`(r3) |
| 585 | std $A[1][2],`8*7`(r3) |
| 586 | std $A[1][3],`8*8`(r3) |
| 587 | std $A[1][4],`8*9`(r3) |
| 588 | std $A[2][0],`8*10`(r3) |
| 589 | std $A[2][1],`8*11`(r3) |
| 590 | std $A[2][2],`8*12`(r3) |
| 591 | std $A[2][3],`8*13`(r3) |
| 592 | std $A[2][4],`8*14`(r3) |
| 593 | std $A[3][0],`8*15`(r3) |
| 594 | std $A[3][1],`8*16`(r3) |
| 595 | std $A[3][2],`8*17`(r3) |
| 596 | std $A[3][3],`8*18`(r3) |
| 597 | std $A[3][4],`8*19`(r3) |
| 598 | std $A[4][0],`8*20`(r3) |
| 599 | std $A[4][1],`8*21`(r3) |
| 600 | std $A[4][2],`8*22`(r3) |
| 601 | std $A[4][3],`8*23`(r3) |
| 602 | std $A[4][4],`8*24`(r3) |
| 603 | |
| 604 | mr r3,r4 ; return value |
| 605 | $POP r0,`$FRAME+$LRSAVE`($sp) |
| 606 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| 607 | $POP r15,`$FRAME-$SIZE_T*17`($sp) |
| 608 | $POP r16,`$FRAME-$SIZE_T*16`($sp) |
| 609 | $POP r17,`$FRAME-$SIZE_T*15`($sp) |
| 610 | $POP r18,`$FRAME-$SIZE_T*14`($sp) |
| 611 | $POP r19,`$FRAME-$SIZE_T*13`($sp) |
| 612 | $POP r20,`$FRAME-$SIZE_T*12`($sp) |
| 613 | $POP r21,`$FRAME-$SIZE_T*11`($sp) |
| 614 | $POP r22,`$FRAME-$SIZE_T*10`($sp) |
| 615 | $POP r23,`$FRAME-$SIZE_T*9`($sp) |
| 616 | $POP r24,`$FRAME-$SIZE_T*8`($sp) |
| 617 | $POP r25,`$FRAME-$SIZE_T*7`($sp) |
| 618 | $POP r26,`$FRAME-$SIZE_T*6`($sp) |
| 619 | $POP r27,`$FRAME-$SIZE_T*5`($sp) |
| 620 | $POP r28,`$FRAME-$SIZE_T*4`($sp) |
| 621 | $POP r29,`$FRAME-$SIZE_T*3`($sp) |
| 622 | $POP r30,`$FRAME-$SIZE_T*2`($sp) |
| 623 | $POP r31,`$FRAME-$SIZE_T*1`($sp) |
| 624 | mtlr r0 |
| 625 | addi $sp,$sp,$FRAME |
| 626 | blr |
| 627 | .long 0 |
| 628 | .byte 0,12,4,1,0x80,18,4,0 |
| 629 | .long 0 |
| 630 | .size SHA3_absorb,.-SHA3_absorb |
| 631 | ___ |
| 632 | { |
# Names for r28-r31, which SHA3_squeeze saves in its prologue and restores in
# its epilogue: the state pointer, the output pointer (kept one byte behind,
# ready for stbu), the number of bytes still to produce, and the block size.
| 633 | my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31)); |
| 634 | $code.=<<___; |
| 635 | .globl SHA3_squeeze |
| 636 | .type SHA3_squeeze,\@function |
| 637 | .align 5 |
| 638 | SHA3_squeeze: |
| 639 | $STU $sp,`-10*$SIZE_T`($sp) |
| 640 | mflr r0 |
| 641 | $PUSH r28,`6*$SIZE_T`($sp) |
| 642 | $PUSH r29,`7*$SIZE_T`($sp) |
| 643 | $PUSH r30,`8*$SIZE_T`($sp) |
| 644 | $PUSH r31,`9*$SIZE_T`($sp) |
| 645 | $PUSH r0,`10*$SIZE_T+$LRSAVE`($sp) |
| 646 | |
| 647 | mr $A_flat,r3 |
| 648 | subi r3,r3,8 ; prepare for ldu |
| 649 | subi $out,r4,1 ; prepare for stbu |
| 650 | mr $len,r5 |
| 651 | mr $bsz,r6 |
| 652 | b .Loop_squeeze |
| 653 | |
| 654 | .align 4 |
| 655 | .Loop_squeeze: |
| 656 | ldu r0,8(r3) |
| 657 | ${UCMP}i $len,8 |
| 658 | blt .Lsqueeze_tail |
| 659 | |
| 660 | stbu r0,1($out) |
| 661 | srdi r0,r0,8 |
| 662 | stbu r0,1($out) |
| 663 | srdi r0,r0,8 |
| 664 | stbu r0,1($out) |
| 665 | srdi r0,r0,8 |
| 666 | stbu r0,1($out) |
| 667 | srdi r0,r0,8 |
| 668 | stbu r0,1($out) |
| 669 | srdi r0,r0,8 |
| 670 | stbu r0,1($out) |
| 671 | srdi r0,r0,8 |
| 672 | stbu r0,1($out) |
| 673 | srdi r0,r0,8 |
| 674 | stbu r0,1($out) |
| 675 | |
| 676 | subic. $len,$len,8 |
| 677 | beq .Lsqueeze_done |
| 678 | |
| 679 | subic. r6,r6,8 |
| 680 | bgt .Loop_squeeze |
| 681 | |
| 682 | mr r3,$A_flat |
| 683 | bl KeccakF1600 |
| 684 | subi r3,$A_flat,8 ; prepare for ldu |
| 685 | mr r6,$bsz |
| 686 | b .Loop_squeeze |
| 687 | |
| 688 | .align 4 |
| 689 | .Lsqueeze_tail: |
| 690 | mtctr $len |
| 691 | .Loop_tail: |
| 692 | stbu r0,1($out) |
| 693 | srdi r0,r0,8 |
| 694 | bdnz .Loop_tail |
| 695 | |
| 696 | .Lsqueeze_done: |
| 697 | $POP r0,`10*$SIZE_T+$LRSAVE`($sp) |
| 698 | $POP r28,`6*$SIZE_T`($sp) |
| 699 | $POP r29,`7*$SIZE_T`($sp) |
| 700 | $POP r30,`8*$SIZE_T`($sp) |
| 701 | $POP r31,`9*$SIZE_T`($sp) |
| 702 | mtlr r0 |
| 703 | addi $sp,$sp,`10*$SIZE_T` |
| 704 | blr |
| 705 | .long 0 |
| 706 | .byte 0,12,4,1,0x80,4,4,0 |
| 707 | .long 0 |
| 708 | .size SHA3_squeeze,.-SHA3_squeeze |
| 709 | ___ |
| 710 | } |
| 711 | |
| 712 | # Ugly hack here, because PPC assembler syntax seems to vary too |
| 713 | # much from platform to platform... |
| 714 | $code.=<<___; |
| 715 | .align 6 |
| 716 | PICmeup: |
| 717 | mflr r0 |
| 718 | bcl 20,31,\$+4 |
| 719 | mflr r12 ; vvvvvv "distance" between . and 1st data entry |
| 720 | addi r12,r12,`64-8` |
| 721 | mtlr r0 |
| 722 | blr |
| 723 | .long 0 |
| 724 | .byte 0,12,0x14,0,0,0,0,0 |
| 725 | .space `64-9*4` |
| 726 | .type iotas,\@object |
| 727 | iotas: |
| 728 | .quad 0x0000000000000001 |
| 729 | .quad 0x0000000000008082 |
| 730 | .quad 0x800000000000808a |
| 731 | .quad 0x8000000080008000 |
| 732 | .quad 0x000000000000808b |
| 733 | .quad 0x0000000080000001 |
| 734 | .quad 0x8000000080008081 |
| 735 | .quad 0x8000000000008009 |
| 736 | .quad 0x000000000000008a |
| 737 | .quad 0x0000000000000088 |
| 738 | .quad 0x0000000080008009 |
| 739 | .quad 0x000000008000000a |
| 740 | .quad 0x000000008000808b |
| 741 | .quad 0x800000000000008b |
| 742 | .quad 0x8000000000008089 |
| 743 | .quad 0x8000000000008003 |
| 744 | .quad 0x8000000000008002 |
| 745 | .quad 0x8000000000000080 |
| 746 | .quad 0x000000000000800a |
| 747 | .quad 0x800000008000000a |
| 748 | .quad 0x8000000080008081 |
| 749 | .quad 0x8000000000008080 |
| 750 | .quad 0x0000000080000001 |
| 751 | .quad 0x8000000080008008 |
| 752 | .size iotas,.-iotas |
| 753 | .asciz "Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>" |
| 754 | ___ |
| 755 | |
# Replace every `...` span in the accumulated template with the result of
# evaluating its contents as a Perl expression (this is how offsets such as
# `$TEMP+8` become plain numbers), print the text to STDOUT, which is piped to
# ppc-xlate.pl, and close the pipe, dying if close reports an error.
| 756 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| 757 | print $code; |
| 758 | close STDOUT or die "error closing STDOUT: $!"; |