yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame^] | 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | # |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | # |
| 17 | # February 2012 |
| 18 | # |
| 19 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication |
| 20 | # used in bn_gf2m.c. It's kind of low-hanging mechanical port from |
| 21 | # C for the time being... The subroutine runs in 37 cycles, which is |
| 22 | # 4.5x faster than compiler-generated code. Though comparison is |
| 23 | # totally unfair, because this module utilizes Galois Field Multiply |
| 24 | # instruction. |
| 25 | |
# Skip leading command-line arguments until one looks like a file name
# ("name.ext"); that argument names the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually given, and fail
# loudly if it cannot be created instead of carrying on silently.  The
# three-argument form keeps the file name from being parsed as a mode.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Registers holding the arguments: result pointer and the four input words.
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");	# argument vector

# Low/high halfword of operand A and the four partial products of each
# halfword (A-side registers A16..A20, B-side registers B16..B20).
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));

# The four bytes operand B is split into.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");

# Registers the driver loads the operands of the 2nd and 3rd
# multiplication into; they alias $Alo and $B_1.
($A,$B)=($Alo,$B_1);

# Register that the generated code loads with the 0xFF byte mask.
$xFF="B1";
| 36 | |
# mul_1x1_upper($A,$B)
#
# Appends to $code the opening stage of a word-by-word carry-less
# multiplication: operand register $B is split into four bytes
# ($B_0..$B_3), operand register $A into two halfwords ($Alo,$Ahi), and
# every halfword is multiplied by every byte with XORMPY, leaving the
# eight partial products in $Alox0..3 / $Ahix0..3.  The partial products
# are not combined here; mul_1x1_merged or mul_1x1_lower does that.
#
# Layout matters to the assembler: mnemonics are indented, and the "||"
# parallel bars start in column 1.  The instruction order is part of the
# schedule and must not be changed.
sub mul_1x1_upper {
    my ($A,$B)=@_;
    $code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged($OUTlo,$OUThi,$A,$B)
#
# Appends to $code two interleaved instruction streams:
#   * the closing stage of the PREVIOUS multiplication, which shifts and
#     XORs the partial products left in $Alox0..3 / $Ahix0..3 into the
#     result pair $OUTlo/$OUThi;
#   * the opening stage of the NEXT multiplication on operand registers
#     $A and $B (same byte/halfword split and XORMPY sequence as
#     mul_1x1_upper), which refills $Alox0..3 / $Ahix0..3.
#
# The new $Alox0 product is first produced in a scratch register and only
# moved into $Alox0 by the last instruction, because the old $Alox0 is
# still read by an XOR emitted after that XORMPY.
#
# The instruction order is the schedule; do not reorder.  Mnemonics are
# indented and the "||" parallel bars start in column 1, as the assembler
# requires.
sub mul_1x1_merged {
    my ($OUTlo,$OUThi,$A,$B)=@_;
    my $Atmp="A1";		# scratch register standing in for $Alox0
    $code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,$Atmp		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	$Atmp,$Alox0
___
}
# mul_1x1_lower($OUTlo,$OUThi)
#
# Appends to $code the closing stage of the last multiplication: the
# partial products left in $Alox0..3 / $Ahix0..3 are shifted into place
# and XORed together into the result pair $OUTlo/$OUThi.  This is the
# combining half of mul_1x1_merged without a following multiplication;
# a NOP stands where mul_1x1_merged issues a packet of its own.
#
# The instruction order is the schedule; do not reorder.  Mnemonics are
# indented and the "||" parallel bars start in column 1, as the assembler
# requires.
sub mul_1x1_lower {
    my ($OUTlo,$OUThi)=@_;
    $code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Driver: assemble the text of _bn_GF2m_mul_2x2 in $code and print it.
#
# The 2x2-word product is built Karatsuba-style from three 1x1
# multiplications -- a0*b0, a1*b1 and (a0+a1)*(b0+b1), where "+" is XOR --
# whose closing and opening stages are overlapped by mul_1x1_merged.

# Prologue.  Directives and instructions are indented; only the label and
# the "||" parallel bars start in column 1.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
mul_1x1_upper($a0,$b0);				# a0·b0

# Load a1/b1 into the operand registers of the next multiplication; the
# first MV joins the last execute packet emitted by mul_1x1_upper.
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
mul_1x1_merged("A28","B28",$A,$B);		# a0·b0/a1·b1

# Form a0+a1 and b0+b1 as operands of the third multiplication.
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
mul_1x1_merged("A31","B31",$A,$B);		# a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
mul_1x1_lower("A30","B30");			# (a0+a1)·(b0+b1)

# Combine the three products and store the four result words through $rp.
# The branch to B3 is issued first and the remaining XOR/STW packets follow
# it (presumably executing in the branch delay slots, with B3 holding the
# return address -- confirm against the C6000 calling convention).
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

print(STDOUT $code) or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";