#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# A POWER8 processor spends 9.8 cycles to process a byte out of a large
# buffer for r=1088, which matches SHA3-256. This is 17% better than
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve a higher scalar instruction issue rate, then
# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.

# The first command-line argument is the target "flavour".  A flavour
# matching /64/ selects the 64-bit ABI, one matching /32/ the 32-bit ABI;
# anything else is rejected.  The variables set here are interpolated
# into the assembly templates below:
#
#   $SIZE_T  size of a general-purpose register / pointer in bytes
#   $LRSAVE  offset of the link-register save slot in the caller's frame
#   $UCMP    unsigned word-sized compare mnemonic
#   $STU     store-with-update mnemonic used to allocate the stack frame
#   $POP     word-sized load mnemonic
#   $PUSH    word-sized store mnemonic
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

# Locate the ppc-xlate.pl translator: first in the directory this script
# lives in, then in ../../perlasm/ relative to that directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator; the next command-line argument (if any) is handed to it
# after the flavour.
#
# The check uses low-precedence "or": with "||" the die would attach to
# the command string, which is always true, and a failed open would go
# unnoticed.
my $output = shift;
$output = "" unless defined $output;
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

# Stack frame used by KeccakF1600 and SHA3_absorb: six words of linkage
# area followed by 13 quadwords.  Twelve quadwords hold v20-v31; the
# prologues address them starting at offset 15+6*$SIZE_T, and the extra
# quadword leaves room for that rounding slack and for the vrsave word,
# which is stored at $FRAME-4.
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";		# stack pointer

my $iotas = "r12";	# PICmeup returns the table address in r12; callers
			# advance it by 16*16 so it points at iotas

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
# Assembly template for the two permutation entry points.  Text between
# backticks is evaluated as a Perl expression by the output loop at the
# end of this script; comments start with ';'.
$code.=<<___;
.machine "any"
.text

; KeccakF1600_int: internal helper, runs 24 rounds on the state held in
; v0-v12 (see the register layout in the generator script).
; Expects the rotation counts in v13-v25 and r12 pointing at iotas.
; Uses r0 as the byte offset into iotas, ctr as the round counter and
; v26-v31 as scratch.
.type KeccakF1600_int,\@function
.align 5
KeccakF1600_int:
	li r0,24
	mtctr r0
	li r0,0
	b .Loop

.align 4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0]
	vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1]
	vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2]
	vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3]
	vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4]
	vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1]
	vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1]
	vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3]
	vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3]
	vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4]
	vxor v26,v26,v31 ; C[0..1]
	vxor v27,v27,v28 ; C[2..3]
	vxor v28,v29,v30 ; C[4..4]
	vspltisb v31,1
	vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1]
	vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3]
	vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low!

	vrld v29,v26,v31 ; ROL64(C[0..1],1)
	vrld v30,v27,v31 ; ROL64(C[2..3],1)
	vrld v31,v28,v31 ; ROL64(C[4..4],1)
	vpermdi v31,v31,v29,0b10
	vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1)
	vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1)
	vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi v29,v26,v26,0b00 ; C[0..0]
	vpermdi v30,v28,v26,0b10 ; C[4..0]
	vpermdi v31,v28,v28,0b11 ; C[4..4]
	vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0]
	vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0]
	vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0]
	vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4]
	vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4]

	vpermdi v29,v27,v27,0b00 ; C[2..2]
	vpermdi v30,v26,v26,0b11 ; C[1..1]
	vpermdi v31,v26,v27,0b10 ; C[1..2]
	vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2]
	vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2]
	vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1]
	vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1]
	vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2]

	vpermdi v29,v27,v27,0b11 ; C[3..3]
	vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3]
	vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3]
	vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld v26,v0, v13 ; v0
	vrld v1, v1, v14
	vrld v27,v2, v15 ; v2
	vrld v28,v3, v16 ; v3
	vrld v4, v4, v17
	vrld v5, v5, v18
	vrld v6, v6, v19
	vrld v29,v7, v20 ; v7
	vrld v8, v8, v21
	vrld v9, v9, v22
	vrld v10,v10,v23
	vrld v30,v11,v24 ; v11
	vrld v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3]
	vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0]
	vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0]
	vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4]
	vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4]
	vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1]
	vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2]
	vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1]
	vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0]
	vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2]
	vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1]
	vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3]
	vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u v31,$iotas,r0 ; iotas[index]
	addic r0,r0,16 ; index++

	vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2])
	vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3])
	vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4])
	vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0])
	vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1])
	vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2])
	vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3])
	vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4])
	vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0])
	vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1])
	vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor v0, v0, v31 ; A[0][0] ^= iotas[index++]

	vpermdi v26,v10,v11,0b10 ; A[4][1..2]
	vpermdi v27,v12,v10,0b00 ; A[4][4..0]
	vpermdi v28,v11,v12,0b10 ; A[4][3..4]
	vpermdi v29,v10,v10,0b10 ; A[4][1..0]
	vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3])
	vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0])
	vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1])
	vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz .Loop

	vpermdi v12,v12,v12,0b11 ; broadcast A[4][4]
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
.size KeccakF1600_int,.-KeccakF1600_int

; KeccakF1600: r3 points at the state in the jagged in-memory layout.
; Saves v20-v31, vrsave and the link register, loads the state into
; v0-v12 and the rotation counts into v13-v25, calls KeccakF1600_int,
; stores the state back through r3 and restores what it saved.
; Scratch registers used: r0, r7, r8, r10, r11, r12.
.type KeccakF1600,\@function
.align 5
KeccakF1600:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r8
	mfspr r7, 256 ; save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r7,`$FRAME-4`($sp) ; save vrsave
	li r0, -1
	$PUSH r8,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 ; preserve all AltiVec registers

	li r11,16
	lvx_4w v0,0,r3 ; load A[5][5]
	li r10,32
	lvx_4w v1,r11,r3
	addi r11,r11,32
	lvx_4w v2,r10,r3
	addi r10,r10,32
	lvx_4w v3,r11,r3
	addi r11,r11,32
	lvx_4w v4,r10,r3
	addi r10,r10,32
	lvx_4w v5,r11,r3
	addi r11,r11,32
	lvx_4w v6,r10,r3
	addi r10,r10,32
	lvx_4w v7,r11,r3
	addi r11,r11,32
	lvx_4w v8,r10,r3
	addi r10,r10,32
	lvx_4w v9,r11,r3
	addi r11,r11,32
	lvx_4w v10,r10,r3
	addi r10,r10,32
	lvx_4w v11,r11,r3
	lvx_splt v12,r10,r3

	bl PICmeup

	li r11,16
	lvx_u v13,0,r12 ; load rhotates
	li r10,32
	lvx_u v14,r11,r12
	addi r11,r11,32
	lvx_u v15,r10,r12
	addi r10,r10,32
	lvx_u v16,r11,r12
	addi r11,r11,32
	lvx_u v17,r10,r12
	addi r10,r10,32
	lvx_u v18,r11,r12
	addi r11,r11,32
	lvx_u v19,r10,r12
	addi r10,r10,32
	lvx_u v20,r11,r12
	addi r11,r11,32
	lvx_u v21,r10,r12
	addi r10,r10,32
	lvx_u v22,r11,r12
	addi r11,r11,32
	lvx_u v23,r10,r12
	addi r10,r10,32
	lvx_u v24,r11,r12
	lvx_u v25,r10,r12
	addi r12,r12,`16*16` ; points at iotas

	bl KeccakF1600_int

	li r11,16
	stvx_4w v0,0,r3 ; return A[5][5]
	li r10,32
	stvx_4w v1,r11,r3
	addi r11,r11,32
	stvx_4w v2,r10,r3
	addi r10,r10,32
	stvx_4w v3,r11,r3
	addi r11,r11,32
	stvx_4w v4,r10,r3
	addi r10,r10,32
	stvx_4w v5,r11,r3
	addi r11,r11,32
	stvx_4w v6,r10,r3
	addi r10,r10,32
	stvx_4w v7,r11,r3
	addi r11,r11,32
	stvx_4w v8,r10,r3
	addi r10,r10,32
	stvx_4w v9,r11,r3
	addi r11,r11,32
	stvx_4w v10,r10,r3
	addi r10,r10,32
	stvx_4w v11,r11,r3
	stvdx_u v12,r10,r3

	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtlr r8
	mtspr 256, r7 ; restore vrsave
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,1,0
	.long 0
.size KeccakF1600,.-KeccakF1600
___
{
# SHA3_absorb argument registers, in order r3..r6:
#   $A_jagged  pointer to the state in the jagged in-memory layout
#   $inp       input pointer, advanced 8 bytes per absorbed lane
#   $len       remaining input length; what is left is returned in r3
#   $bsz       block size in bytes; $bsz/8 lanes are absorbed per block
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
; SHA3_absorb: while at least one whole block of input remains, XOR it
; into the state lane by lane and run KeccakF1600_int.  The state stays
; in v0-v12 across blocks and is written back once at the end.
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 5
SHA3_absorb:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mflr r8
	mfspr r7, 256 ; save vrsave
	stvx v20,r10,$sp
	addi r10,r10,32
	stvx v21,r11,$sp
	addi r11,r11,32
	stvx v22,r10,$sp
	addi r10,r10,32
	stvx v23,r11,$sp
	addi r11,r11,32
	stvx v24,r10,$sp
	addi r10,r10,32
	stvx v25,r11,$sp
	addi r11,r11,32
	stvx v26,r10,$sp
	addi r10,r10,32
	stvx v27,r11,$sp
	addi r11,r11,32
	stvx v28,r10,$sp
	addi r10,r10,32
	stvx v29,r11,$sp
	addi r11,r11,32
	stvx v30,r10,$sp
	stvx v31,r11,$sp
	stw r7,`$FRAME-4`($sp) ; save vrsave
	li r0, -1
	$PUSH r8,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 ; preserve all AltiVec registers

	li r11,16
	lvx_4w v0,0,$A_jagged ; load A[5][5]
	li r10,32
	lvx_4w v1,r11,$A_jagged
	addi r11,r11,32
	lvx_4w v2,r10,$A_jagged
	addi r10,r10,32
	lvx_4w v3,r11,$A_jagged
	addi r11,r11,32
	lvx_4w v4,r10,$A_jagged
	addi r10,r10,32
	lvx_4w v5,r11,$A_jagged
	addi r11,r11,32
	lvx_4w v6,r10,$A_jagged
	addi r10,r10,32
	lvx_4w v7,r11,$A_jagged
	addi r11,r11,32
	lvx_4w v8,r10,$A_jagged
	addi r10,r10,32
	lvx_4w v9,r11,$A_jagged
	addi r11,r11,32
	lvx_4w v10,r10,$A_jagged
	addi r10,r10,32
	lvx_4w v11,r11,$A_jagged
	lvx_splt v12,r10,$A_jagged

	bl PICmeup

	li r11,16
	lvx_u v13,0,r12 ; load rhotates
	li r10,32
	lvx_u v14,r11,r12
	addi r11,r11,32
	lvx_u v15,r10,r12
	addi r10,r10,32
	lvx_u v16,r11,r12
	addi r11,r11,32
	lvx_u v17,r10,r12
	addi r10,r10,32
	lvx_u v18,r11,r12
	addi r11,r11,32
	lvx_u v19,r10,r12
	addi r10,r10,32
	lvx_u v20,r11,r12
	addi r11,r11,32
	lvx_u v21,r10,r12
	addi r10,r10,32
	lvx_u v22,r11,r12
	addi r11,r11,32
	lvx_u v23,r10,r12
	addi r10,r10,32
	lvx_u v24,r11,r12
	lvx_u v25,r10,r12
	li r10,-32
	li r11,-16
	addi r12,r12,`16*16` ; points at iotas
	b .Loop_absorb

.align 4
.Loop_absorb:
	$UCMP $len,$bsz ; len < bsz?
	blt .Labsorbed

	sub $len,$len,$bsz ; len -= bsz
	srwi r0,$bsz,3
	mtctr r0

	; The two masks sit right below iotas (offsets -32 and -16).
	; They are reloaded for every block because KeccakF1600_int
	; uses v26-v31 as scratch.
	lvx_u v30,r10,r12 ; permutation masks
	lvx_u v31,r11,r12
	?vspltisb v27,7 ; prepare masks for byte swap
	?vxor v30,v30,v27 ; on big-endian
	?vxor v31,v31,v27

	; Each step loads one 8-byte lane and uses vperm with the zero
	; vector v27 to place it in one half of v26: v30 selects the
	; A[0], A[2], A[4][0] and A[4][2] halves, v31 the other ones.
	; ctr counts the lanes of the block, bdz leaves once it is empty.
	vxor v27,v27,v27 ; zero
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v0, v0, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v1, v1, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v2, v2, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v3, v3, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v4, v4, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v0, v0, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v1, v1, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v2, v2, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v3, v3, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v4, v4, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v5, v5, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v6, v6, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v7, v7, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v8, v8, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v9, v9, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v5, v5, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v6, v6, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v7, v7, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v8, v8, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v9, v9, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v10, v10, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v10, v10, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v30
	vxor v11, v11, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v11, v11, v26
	bdz .Lprocess_block
	lvdx_u v26,0,$inp
	addi $inp,$inp,8
	vperm v26,v26,v27,v31
	vxor v12, v12, v26

.Lprocess_block:
	bl KeccakF1600_int

	b .Loop_absorb

.align 4
.Labsorbed:
	li r11,16
	stvx_4w v0,0,$A_jagged ; return A[5][5]
	li r10,32
	stvx_4w v1,r11,$A_jagged
	addi r11,r11,32
	stvx_4w v2,r10,$A_jagged
	addi r10,r10,32
	stvx_4w v3,r11,$A_jagged
	addi r11,r11,32
	stvx_4w v4,r10,$A_jagged
	addi r10,r10,32
	stvx_4w v5,r11,$A_jagged
	addi r11,r11,32
	stvx_4w v6,r10,$A_jagged
	addi r10,r10,32
	stvx_4w v7,r11,$A_jagged
	addi r11,r11,32
	stvx_4w v8,r10,$A_jagged
	addi r10,r10,32
	stvx_4w v9,r11,$A_jagged
	addi r11,r11,32
	stvx_4w v10,r10,$A_jagged
	addi r10,r10,32
	stvx_4w v11,r11,$A_jagged
	stvdx_u v12,r10,$A_jagged

	mr r3,$len ; return value
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtlr r8
	mtspr 256, r7 ; restore vrsave
	lvx v20,r10,$sp
	addi r10,r10,32
	lvx v21,r11,$sp
	addi r11,r11,32
	lvx v22,r10,$sp
	addi r10,r10,32
	lvx v23,r11,$sp
	addi r11,r11,32
	lvx v24,r10,$sp
	addi r10,r10,32
	lvx v25,r11,$sp
	addi r11,r11,32
	lvx v26,r10,$sp
	addi r10,r10,32
	lvx v27,r11,$sp
	addi r11,r11,32
	lvx v28,r10,$sp
	addi r10,r10,32
	lvx v29,r11,$sp
	addi r11,r11,32
	lvx v30,r10,$sp
	lvx v31,r11,$sp
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,0x04,1,0x80,0,4,0
	.long 0
.size SHA3_absorb,.-SHA3_absorb
___
}
{
# SHA3_squeeze argument registers, in order r3..r6:
#   $A_jagged  pointer to the state in the jagged in-memory layout
#   $out       output pointer (pre-decremented so stbu can be used)
#   $len       number of output bytes still to produce
#   $bsz       block size in bytes; once that many bytes of the state
#              have been emitted, KeccakF1600 is called again
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
; SHA3_squeeze: copy lanes from the in-memory state to the output one
; byte at a time, least significant byte first.  r11 is the byte offset
; of the current lane within the jagged state, r10 counts down the bytes
; left in the current block, r9 keeps the return address.
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 5
SHA3_squeeze:
	mflr r9 ; r9 is not touched by KeccakF1600
	subi $out,$out,1 ; prepare for stbu
	addi r8,$A_jagged,4 ; prepare volatiles
	mr r10,$bsz
	li r11,0
	b .Loop_squeeze
.align 4
.Loop_squeeze:
	lwzx r7,r11,r8 ; lo
	lwzx r0,r11,$A_jagged ; hi
	${UCMP}i $len,8
	blt .Lsqueeze_tail

	stbu r7,1($out) ; write lo
	srwi r7,r7,8
	stbu r7,1($out)
	srwi r7,r7,8
	stbu r7,1($out)
	srwi r7,r7,8
	stbu r7,1($out)
	stbu r0,1($out) ; write hi
	srwi r0,r0,8
	stbu r0,1($out)
	srwi r0,r0,8
	stbu r0,1($out)
	srwi r0,r0,8
	stbu r0,1($out)

	subic. $len,$len,8
	beqlr ; return if done

	subic. r10,r10,8
	ble .Loutput_expand

	addi r11,r11,16 ; calculate jagged index
	cmplwi r11,`16*5`
	blt .Loop_squeeze
	subi r11,r11,72
	beq .Loop_squeeze
	addi r11,r11,72
	cmplwi r11,`16*5+8`
	subi r11,r11,8
	beq .Loop_squeeze
	addi r11,r11,8
	cmplwi r11,`16*10`
	subi r11,r11,72
	beq .Loop_squeeze
	addi r11,r11,72
	blt .Loop_squeeze
	subi r11,r11,8
	b .Loop_squeeze

.align 4
.Loutput_expand:
	bl KeccakF1600
	mtlr r9

	; KeccakF1600 overwrites r8, r10 and r11, so set them up again
	addi r8,$A_jagged,4 ; restore volatiles
	mr r10,$bsz
	li r11,0
	b .Loop_squeeze

.align 4
.Lsqueeze_tail:
	; fewer than 8 bytes left: emit up to four bytes of lo, then
	; whatever remains from hi
	mtctr $len
	subic. $len,$len,4
	ble .Loop_tail_lo
	li r8,4
	mtctr r8
.Loop_tail_lo:
	stbu r7,1($out)
	srdi r7,r7,8
	bdnz .Loop_tail_lo
	ble .Lsqueeze_done
	mtctr $len
.Loop_tail_hi:
	stbu r0,1($out)
	srdi r0,r0,8
	bdnz .Loop_tail_hi

.Lsqueeze_done:
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,4,0
	.long 0
.size SHA3_squeeze,.-SHA3_squeeze
___
}
# Position-independent table lookup plus the constant tables.  The layout
# is load-bearing: PICmeup starts on a 64-byte boundary, is padded to
# exactly 64 bytes, and rhotates follows immediately, so the address
# captured by the second mflr (PICmeup+8) plus 64-8 is the address of
# rhotates.  rhotates (13 quadwords), one zero quadword and the two
# permutation masks add up to 16 quadwords, which is why callers add
# 16*16 to reach iotas and find the masks at offsets -32 and -16.
$code.=<<___;
.align 6
PICmeup:
	mflr r0
	bcl 20,31,\$+4
	mflr r12 ; vvvvvv "distance" between . and 1st data entry
	addi r12,r12,`64-8`
	mtlr r0
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,0,0
	.space `64-9*4`
.type rhotates,\@object
.align 6
rhotates:
	.quad 0, 36
	.quad 1, 44
	.quad 62, 6
	.quad 28, 55
	.quad 27, 20
	.quad 3, 41
	.quad 10, 45
	.quad 43, 15
	.quad 25, 21
	.quad 39, 8
	.quad 18, 2
	.quad 61, 56
	.quad 14, 14
.size rhotates,.-rhotates
	.quad 0,0
	.quad 0x0001020304050607,0x1011121314151617
	.quad 0x1011121314151617,0x0001020304050607
.type iotas,\@object
iotas:
	.quad 0x0000000000000001,0
	.quad 0x0000000000008082,0
	.quad 0x800000000000808a,0
	.quad 0x8000000080008000,0
	.quad 0x000000000000808b,0
	.quad 0x0000000080000001,0
	.quad 0x8000000080008081,0
	.quad 0x8000000000008009,0
	.quad 0x000000000000008a,0
	.quad 0x0000000000000088,0
	.quad 0x0000000080008009,0
	.quad 0x000000008000000a,0
	.quad 0x000000008000808b,0
	.quad 0x800000000000008b,0
	.quad 0x8000000000008089,0
	.quad 0x8000000000008003,0
	.quad 0x8000000000008002,0
	.quad 0x8000000000000080,0
	.quad 0x000000000000800a,0
	.quad 0x800000008000000a,0
	.quad 0x8000000080008081,0
	.quad 0x8000000000008080,0
	.quad 0x0000000080000001,0
	.quad 0x8000000080008008,0
.size iotas,.-iotas
.asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated template line by line and print it to
# STDOUT (the pipe into the translator):
#   - text between backticks is evaluated as a Perl expression and
#     replaced by its value;
#   - the first "?mnemonic" on a line marks a big-endian-only
#     instruction: it is turned into a comment for little-endian
#     flavours and has its marker stripped otherwise.
my $little_endian = ($flavour =~ /le$/);

foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($little_endian) {
		$line =~ s/\?([a-z]+)/;$1/;
	} else {
		$line =~ s/\?([a-z]+)/$1/;
	}

	print $line,"\n";
}

# Closing the pipe waits for the translator, so a failure there is
# reported here.
close STDOUT or die "error closing STDOUT: $!";