#! /usr/bin/env perl
| 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | |
| 17 | # Poly1305 hash for MIPS64. |
| 18 | # |
| 19 | # May 2016 |
| 20 | # |
| 21 | # Numbers are cycles per processed byte with poly1305_blocks alone. |
| 22 | # |
| 23 | # IALU/gcc |
| 24 | # R1x000 5.64/+120% (big-endian) |
| 25 | # Octeon II 3.80/+280% (little-endian) |
| 26 | |
| 27 | ###################################################################### |
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
| 32 | # |
# Each name below holds the textual register operand ("$" followed by the
# hardware register number) that gets interpolated into the assembly
# templates further down.
($zero, $at, $t0, $t1, $t2) = map { '$' . $_ } (0 .. 2, 24, 25);
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { '$' . $_ } (4 .. 11);
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { '$' . $_ } (12 .. 23);
($gp, $tp, $sp, $fp, $ra) = map { '$' . $_ } (3, 28 .. 31);
| 37 | # |
| 38 | # The return value is placed in $a0. Following coding rules facilitate |
| 39 | # interoperability: |
| 40 | # |
| 41 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be |
| 42 | # excluded from the rule, because it's specified volatile]; |
| 43 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting |
| 44 | # old code]; |
| 45 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; |
| 46 | # |
| 47 | # For reference here is register layout for N32/64 MIPS ABIs: |
| 48 | # |
| 49 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); |
| 50 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); |
| 51 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); |
| 52 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); |
| 53 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); |
| 54 | # |
| 55 | # <appro@openssl.org> |
| 56 | # |
| 57 | ###################################################################### |
| 58 | |
# The ABI flavour is the first command-line argument. Flavour names used
# by the perlasm scripts are o32, n32, 64, nubi32 and nubi64; a missing
# (or false) argument falls back to "o32".
$flavour = shift;
$flavour = "o32" unless $flavour;

# Only flavours matching /64|n32/ are accepted here, so the "o32" default
# is rejected as well.
if ($flavour !~ /64|n32/i) {
    die "MIPS64 only";
}

# $v0              - register that receives the return value: $a0 under
#                    NUBI, otherwise $t0 (hardware register 2).
# $SAVED_REGS_MASK - operand of the .mask directive in
#                    poly1305_blocks_internal: bits 12-17 under NUBI,
#                    bits 16-17 otherwise.
if ($flavour =~ /nubi/i) {
    $v0              = $a0;
    $SAVED_REGS_MASK = "0x0003f000";
}
else {
    $v0              = $t0;
    $SAVED_REGS_MASK = "0x00030000";
}

# Incoming arguments shared by poly1305_init and poly1305_blocks.
($ctx, $inp, $len, $padbit) = ($a0, $a1, $a2, $a3);

# Two 64-bit input words followed by five scratch registers.  Note that
# $tmp2 is $at, which is why the generated code uses ".set noat".
($in0, $in1) = ($a4, $a5);
($tmp0, $tmp1, $tmp2, $tmp3, $tmp4) = ($a6, $a7, $at, $t0, $t1);
| 68 | |
# ---------------------------------------------------------------------
# poly1305_init($ctx, $inp)
#
# The template is appended to $code in consecutive pieces; joined
# together they form exactly one assembly routine.
# ---------------------------------------------------------------------

# File preamble: MSB/LSB are the byte offsets used with ldl/ldr for
# unaligned 64-bit loads, and depend on the target endianness.
$code .= <<___;
#include "mips_arch.h"

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set noat
.set noreorder

___

# Entry: clear the three hash words at offsets 0/8/16 of the context and
# skip the key setup entirely when no key pointer was supplied.
$code .= <<___;
.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
.frame $sp,0,$ra
.set reorder

sd $zero,0($ctx)
sd $zero,8($ctx)
sd $zero,16($ctx)

beqz $inp,.Lno_key

___

# Fetch the first 16 key bytes as two 64-bit words: plain ld on MIPS64R6,
# ldl/ldr pairs everywhere else.
$code .= <<___;
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp)
ld $in1,8($inp)
#else
ldl $in0,0+MSB($inp)
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
___

# Big-endian targets byte-swap both words: dsbh/dshd on MIPS64R2,
# otherwise a mask-and-shift sequence.
$code .= <<___;
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF

and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
___

# Clamp the key with the masks 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc
# (built from small immediates), then store r0 at 24, r1 at 32 and
# s1 = r1 + (r1 >> 2) at 40.  Both paths return 0.
$code .= <<___;
li $tmp0,1
dsll $tmp0,32
daddiu $tmp0,-63
dsll $tmp0,28
daddiu $tmp0,-1 # 0ffffffc0fffffff

and $in0,$tmp0
daddiu $tmp0,-3 # 0ffffffc0ffffffc
and $in1,$tmp0

sd $in0,24($ctx)
dsrl $tmp0,$in1,2
sd $in1,32($ctx)
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
sd $tmp0,40($ctx)

.Lno_key:
li $v0,0 # return 0
jr $ra
.end poly1305_init
___
{
# ---------------------------------------------------------------------
# poly1305_blocks($ctx, $inp, $len, $padbit)
#
# poly1305_blocks converts $len to a count of complete 16-byte blocks and
# returns immediately when that count is zero; otherwise it branches to
# poly1305_blocks_internal, which processes one block per .Loop pass.
#
# Register roles inside the loop:
#   $h0-$h2  hash accumulator (context offsets 0/8/16)
#   $r0,$r1  key words        (context offsets 24/32)
#   $rs1     r1 + (r1 >> 2)   (context offset 40)
#   $d0-$d2  product accumulator
#
# The precomputed-multiplier register is deliberately named $rs1 and not
# $s1: a lexical $s1 would shadow the file-level $s1 register name, and
# the NUBI prologue/epilogue below ("sd \$s1,8(\$sp)" / "ld \$s1,8(\$sp)")
# would then save and restore the wrong register, leaving the register
# used for $h1 clobbered although .mask lists it as saved.
# ---------------------------------------------------------------------
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

$code.=<<___;
.align 5
.globl poly1305_blocks
.ent poly1305_blocks
poly1305_blocks:
.set noreorder
dsrl $len,4 # number of complete blocks
bnez $len,poly1305_blocks_internal
nop
jr $ra
nop
.end poly1305_blocks

.align 5
.ent poly1305_blocks_internal
poly1305_blocks_internal:
.frame $sp,6*8,$ra
.mask $SAVED_REGS_MASK,-8
.set noreorder
dsubu $sp,6*8
sd $s5,40($sp)
sd $s4,32($sp)
___
# Only NUBI flavours spill the remaining four registers; here $s0-$s3
# are the file-level register names.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
sd $s3,24($sp)
sd $s2,16($sp)
sd $s1,8($sp)
sd $s0,0($sp)
___
# Main loop: load one block (byte-swapped on big-endian targets), add it
# and $padbit to the hash, multiply by the key, then fold everything
# above bit 1 of the top word back into the low words.
$code.=<<___;
.set reorder

ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)

ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp) # load input
ld $in1,8($inp)
#else
ldl $in0,0+MSB($inp) # load input
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
daddiu $len,-1
daddiu $inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF

and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
daddu $h0,$in0 # accumulate input
daddu $h1,$in1
sltu $tmp0,$h0,$in0
sltu $tmp1,$h1,$in1
daddu $h1,$tmp0

dmultu ($r0,$h0) # h0*r0
daddu $h2,$padbit
sltu $tmp0,$h1,$tmp0
mflo ($d0,$r0,$h0)
mfhi ($d1,$r0,$h0)

dmultu ($rs1,$h1) # h1*5*r1
daddu $tmp0,$tmp1
daddu $h2,$tmp0
mflo ($tmp0,$rs1,$h1)
mfhi ($tmp1,$rs1,$h1)

dmultu ($r1,$h0) # h0*r1
daddu $d0,$tmp0
daddu $d1,$tmp1
mflo ($tmp2,$r1,$h0)
mfhi ($d2,$r1,$h0)
sltu $tmp0,$d0,$tmp0
daddu $d1,$tmp0

dmultu ($r0,$h1) # h1*r0
daddu $d1,$tmp2
sltu $tmp2,$d1,$tmp2
mflo ($tmp0,$r0,$h1)
mfhi ($tmp1,$r0,$h1)
daddu $d2,$tmp2

dmultu ($rs1,$h2) # h2*5*r1
daddu $d1,$tmp0
daddu $d2,$tmp1
mflo ($tmp2,$rs1,$h2)

dmultu ($r0,$h2) # h2*r0
sltu $tmp0,$d1,$tmp0
daddu $d2,$tmp0
mflo ($tmp3,$r0,$h2)

daddu $d1,$tmp2
daddu $d2,$tmp3
sltu $tmp2,$d1,$tmp2
daddu $d2,$tmp2

li $tmp0,-4 # final reduction
and $tmp0,$d2
dsrl $tmp1,$d2,2
andi $h2,$d2,3
daddu $tmp0,$tmp1
daddu $h0,$d0,$tmp0
sltu $tmp0,$h0,$tmp0
daddu $h1,$d1,$tmp0
sltu $tmp0,$h1,$tmp0
daddu $h2,$h2,$tmp0

bnez $len,.Loop

sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)

.set noreorder
ld $s5,40($sp) # epilogue
ld $s4,32($sp)
___
# Mirror of the NUBI-only part of the prologue.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
ld $s3,24($sp)
ld $s2,16($sp)
ld $s1,8($sp)
ld $s0,0($sp)
___
# Return; the stack adjustment sits in the delay slot of jr.
$code.=<<___;
jr $ra
daddu $sp,6*8
.end poly1305_blocks_internal
___
}
{
# ---------------------------------------------------------------------
# poly1305_emit($ctx, $mac, $nonce)
#
# The template is appended to $code in consecutive pieces; joined
# together they form exactly one assembly routine plus the trailing
# identification string.
# ---------------------------------------------------------------------
my $ctx   = $a0;    # context holding the hash words at offsets 0/8/16
my $mac   = $a1;    # destination of the 16 tag bytes
my $nonce = $a2;    # four 32-bit words added to the hash

# Entry: load the three hash words.
$code .= <<___;
.align 5
.globl poly1305_emit
.ent poly1305_emit
poly1305_emit:
.frame $sp,0,$ra
.set reorder

ld $tmp0,0($ctx)
ld $tmp1,8($ctx)
ld $tmp2,16($ctx)

___

# Compute hash + 5 with carry propagation.  The top word shifted right
# by 2 is negated into a select mask (and its complement), which picks
# either hash + 5 or the original hash for the low 128 bits.
$code .= <<___;
daddiu $in0,$tmp0,5 # compare to modulus
sltiu $tmp3,$in0,5
daddu $in1,$tmp1,$tmp3
sltu $tmp3,$in1,$tmp3
daddu $tmp2,$tmp2,$tmp3

dsrl $tmp2,2 # see if it carried/borrowed
dsubu $tmp2,$zero,$tmp2
nor $tmp3,$zero,$tmp2

and $in0,$tmp2
and $tmp0,$tmp3
and $in1,$tmp2
and $tmp1,$tmp3
or $in0,$tmp0
or $in1,$tmp1

___

# Assemble the nonce from four zero-extended 32-bit loads and add it as
# a 128-bit value; the carry out of the upper word is dropped.
$code .= <<___;
lwu $tmp0,0($nonce) # load nonce
lwu $tmp1,4($nonce)
lwu $tmp2,8($nonce)
lwu $tmp3,12($nonce)
dsll $tmp1,32
dsll $tmp3,32
or $tmp0,$tmp1
or $tmp2,$tmp3

daddu $in0,$tmp0 # accumulate nonce
daddu $in1,$tmp2
sltu $tmp0,$in0,$tmp0
daddu $in1,$tmp0

___

# Store the result one byte at a time, least significant byte first, so
# the output layout does not depend on alignment or endianness.
$code .= <<___;
dsrl $tmp0,$in0,8 # write mac value
dsrl $tmp1,$in0,16
dsrl $tmp2,$in0,24
sb $in0,0($mac)
dsrl $tmp3,$in0,32
sb $tmp0,1($mac)
dsrl $tmp0,$in0,40
sb $tmp1,2($mac)
dsrl $tmp1,$in0,48
sb $tmp2,3($mac)
dsrl $tmp2,$in0,56
sb $tmp3,4($mac)
dsrl $tmp3,$in1,8
sb $tmp0,5($mac)
dsrl $tmp0,$in1,16
sb $tmp1,6($mac)
dsrl $tmp1,$in1,24
sb $tmp2,7($mac)

sb $in1,8($mac)
dsrl $tmp2,$in1,32
sb $tmp3,9($mac)
dsrl $tmp3,$in1,40
sb $tmp0,10($mac)
dsrl $tmp0,$in1,48
sb $tmp1,11($mac)
dsrl $tmp1,$in1,56
sb $tmp2,12($mac)
sb $tmp3,13($mac)
sb $tmp0,14($mac)
sb $tmp1,15($mac)

___

# Return, followed by the read-only identification string ("\@" keeps
# Perl from treating the address as an array).
$code .= <<___;
jr $ra
.end poly1305_emit
.rdata
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
}
| 433 | |
# The last remaining command-line argument, when present, names the file
# that receives the generated assembly; without it the code is written
# to the inherited STDOUT.  The three-argument open keeps characters in
# the file name from being interpreted as an open mode, and a failed
# open is reported right away instead of surfacing later as a confusing
# close error.
$output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";
| 437 | |