yuezonghe | 824eb0c | 2024-06-27 02:32:26 -0700 | [diff] [blame] | 1 | #! /usr/bin/env perl |
| 2 | # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 3 | # |
| 4 | # Licensed under the OpenSSL license (the "License"). You may not use |
| 5 | # this file except in compliance with the License. You can obtain a copy |
| 6 | # in the file LICENSE in the source distribution or at |
| 7 | # https://www.openssl.org/source/license.html |
| 8 | |
| 9 | # |
| 10 | # ==================================================================== |
| 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 12 | # project. The module is, however, dual licensed under OpenSSL and |
| 13 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 14 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 15 | # ==================================================================== |
| 16 | # |
| 17 | # December 2011 |
| 18 | # |
| 19 | # The module implements GCM GHASH function and underlying single |
| 20 | # multiplication operation in GF(2^128). Even though subroutines |
| 21 | # have _4bit suffix, they are not using any tables, but rely on |
| 22 | # hardware Galois Field Multiply support. Streamed GHASH processes |
| 23 | # byte in ~7 cycles, which is >6x faster than "4-bit" table-driven |
| 24 | # code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are |
| 25 | # comparing apples vs. oranges, but compiler surely could have done |
| 26 | # better, because theoretical [though not necessarily achievable] |
| 27 | # estimate for "4-bit" table-driven implementation is ~12 cycles. |
| 28 | |
# Command-line handling: shift arguments off @ARGV until one looks like an
# output file name (ends in "<word>.<ext>").  Arguments in front of it are
# discarded.  $output is left false when no argument matches.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named; otherwise the
# generated assembly goes to the STDOUT this script was started with.
# (Unconditionally re-opening with an empty name fails and leaves STDOUT
# closed, so nothing would be emitted at all.)
# The three-argument form keeps the file name from being parsed as part of
# the open mode, and a failed open is reported immediately.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
| 31 | |
# Register allocation.  Each Perl variable holds the name of the machine
# register that the here-documents below interpolate into the assembly.

# Incoming arguments.
$Xip    = "A4";
$Htable = "B4";
$inp    = "A6";
$len    = "B6";

# A-side registers A16..A27.
($Z0,  $Z1,  $Z2,  $Z3)  = qw(A16 A17 A18 A19);
($H0,  $H1,  $H2,  $H3)  = qw(A20 A21 A22 A23);
($H0x, $H1x, $H2x, $H3x) = qw(A24 A25 A26 A27);

# B-side registers B16..B27.
($H01u, $H01y, $H2u, $H3u) = qw(B16 B17 B18 B19);
($H0y,  $H1y,  $H2y, $H3y) = qw(B20 B21 B22 B23);
($H0z,  $H1z,  $H2z, $H3z) = qw(B24 B25 B26 B27);

# Constants kept in registers.
$FF000000 = "B30";
$E10000   = "B31";

# Working registers.  $xip is B6 and therefore zaps $len; $rem is B4 and
# therefore zaps $Htable.
($xip, $x0, $x1, $xib) = qw(B6 B7 B8 B9);
$xia = "A9";
$rem = "B4";
$res = "B5";
| 42 | |
# First part of the generated assembly.  The here-document is unquoted, so
# every $name / ${name} below is replaced by the register name held in that
# Perl variable before the text is appended to $code.
#
# What the emitted text contains, in order:
#   * .text section start and .asg aliases.  __TI_EABI__ is forced to 0 for
#     assembler versions below 7000000; when it is non-zero the
#     underscore-prefixed symbol names are aliased to the plain ones.
#   * a gcm_gmult_1bit stub that is never assembled (guarded by ".if 0").
#   * gcm_gmult_4bit: loads H from doubleword indices -1 and -2 relative to
#     $Htable, zeroes Z, sets B0 to 1 ("take a single spin") and branches to
#     ghash_loop?.
#   * gcm_ghash_4bit: sets B0 to len>>4, loads H and Xi, and -- only when B0
#     is non-zero -- loads 16 input bytes and XORs them into Xi before
#     running on into ghash_loop?.
#   * ghash_loop?: the SPLOOPD header; the loop body is emitted by the next
#     here-document.
# Instruction order and the "||" parallel-issue markers are part of the
# emitted program; do not reorder or reflow these lines.
| 43 | $code.=<<___; |
| 44 | .text |
| 45 | |
| 46 | .if .ASSEMBLER_VERSION<7000000 |
| 47 | .asg 0,__TI_EABI__ |
| 48 | .endif |
| 49 | .if __TI_EABI__ |
| 50 | .asg gcm_gmult_1bit,_gcm_gmult_1bit |
| 51 | .asg gcm_gmult_4bit,_gcm_gmult_4bit |
| 52 | .asg gcm_ghash_4bit,_gcm_ghash_4bit |
| 53 | .endif |
| 54 | |
| 55 | .asg B3,RA |
| 56 | |
| 57 | .if 0 |
| 58 | .global _gcm_gmult_1bit |
| 59 | _gcm_gmult_1bit: |
| 60 | ADDAD $Htable,2,$Htable |
| 61 | .endif |
| 62 | .global _gcm_gmult_4bit |
| 63 | _gcm_gmult_4bit: |
| 64 | .asmfunc |
| 65 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo |
| 66 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi |
| 67 | || MV $Xip,${xip} ; reassign Xi |
| 68 | || MVK 15,B1 ; SPLOOPD constant |
| 69 | |
| 70 | MVK 0xE1,$E10000 |
| 71 | || LDBU *++${xip}[15],$x1 ; Xi[15] |
| 72 | MVK 0xFF,$FF000000 |
| 73 | || LDBU *--${xip},$x0 ; Xi[14] |
| 74 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial |
| 75 | SHL $FF000000,24,$FF000000 ; upper byte mask |
| 76 | || BNOP ghash_loop? |
| 77 | || MVK 1,B0 ; take a single spin |
| 78 | |
| 79 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes |
| 80 | AND $H2,$FF000000,$H2u ; H2's upper byte |
| 81 | AND $H3,$FF000000,$H3u ; H3's upper byte |
| 82 | || SHRU $H2u,8,$H2u |
| 83 | SHRU $H3u,8,$H3u |
| 84 | || ZERO $Z1:$Z0 |
| 85 | SHRU2 $xia,8,$H01u |
| 86 | || ZERO $Z3:$Z2 |
| 87 | .endasmfunc |
| 88 | |
| 89 | .global _gcm_ghash_4bit |
| 90 | _gcm_ghash_4bit: |
| 91 | .asmfunc |
| 92 | LDDW *${Htable}[-1],$H1:$H0 ; H.lo |
| 93 | || SHRU $len,4,B0 ; reassign len |
| 94 | LDDW *${Htable}[-2],$H3:$H2 ; H.hi |
| 95 | || MV $Xip,${xip} ; reassign Xi |
| 96 | || MVK 15,B1 ; SPLOOPD constant |
| 97 | |
| 98 | MVK 0xE1,$E10000 |
| 99 | || [B0] LDNDW *${inp}[1],$H1x:$H0x |
| 100 | MVK 0xFF,$FF000000 |
| 101 | || [B0] LDNDW *${inp}++[2],$H3x:$H2x |
| 102 | SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial |
| 103 | || LDDW *${xip}[1],$Z1:$Z0 |
| 104 | SHL $FF000000,24,$FF000000 ; upper byte mask |
| 105 | || LDDW *${xip}[0],$Z3:$Z2 |
| 106 | |
| 107 | PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes |
| 108 | AND $H2,$FF000000,$H2u ; H2's upper byte |
| 109 | AND $H3,$FF000000,$H3u ; H3's upper byte |
| 110 | || SHRU $H2u,8,$H2u |
| 111 | SHRU $H3u,8,$H3u |
| 112 | SHRU2 $xia,8,$H01u |
| 113 | |
| 114 | || [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp |
| 115 | || [B0] XOR $H1x,$Z1,$Z1 |
| 116 | .if .LITTLE_ENDIAN |
| 117 | [B0] XOR $H2x,$Z2,$Z2 |
| 118 | || [B0] XOR $H3x,$Z3,$Z3 |
| 119 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall |
| 120 | STDW $Z1:$Z0,*${xip}[1] |
| 121 | || [B0] SHRU $Z1,16,$x0 ; Xi[14] |
| 122 | || [B0] ZERO $Z1:$Z0 |
| 123 | .else |
| 124 | [B0] XOR $H2x,$Z2,$Z2 |
| 125 | || [B0] XOR $H3x,$Z3,$Z3 |
| 126 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall |
| 127 | STDW $Z1:$Z0,*${xip}[1] |
| 128 | || [B0] SHRU $Z0,8,$x0 ; Xi[14] |
| 129 | || [B0] ZERO $Z1:$Z0 |
| 130 | .endif |
| 131 | STDW $Z3:$Z2,*${xip}[0] |
| 132 | || [B0] ZERO $Z3:$Z2 |
| 133 | || [B0] MV $xia,$x1 |
| 134 | [B0] ADDK 14,${xip} |
| 135 | |
| 136 | ghash_loop?: |
| 137 | SPLOOPD 6 ; 6*16+7 |
| 138 | || MVC B1,ILC |
| 139 | || [B0] SUB B0,1,B0 |
| 140 | || ZERO A0 |
| 141 | || ADD $x1,$x1,$xib ; SHL $x1,1,$xib |
| 142 | || SHL $x1,1,$xia |
| 143 | ___ |
| 144 | |
| 145 | ########____________________________ |
| 146 | # 0 D2. M1 M2 | |
| 147 | # 1 M1 | |
| 148 | # 2 M1 M2 | |
| 149 | # 3 D1. M1 M2 | |
| 150 | # 4 S1. L1 | |
| 151 | # 5 S2 S1x L1 D2 L2 |____________________________ |
| 152 | # 6/0 L1 S1 L2 S2x |D2. M1 M2 | |
| 153 | # 7/1 L1 S1 D1x S2 M2 | M1 | |
| 154 | # 8/2 S1 L1x S2 | M1 M2 | |
| 155 | # 9/3 S1 L1x | D1. M1 M2 | |
| 156 | # 10/4 D1x | S1. L1 | |
| 157 | # 11/5 |S2 S1x L1 D2 L2 |____________ |
| 158 | # 12/6/0 D1x __| L1 S1 L2 S2x |D2. .... |
| 159 | # 7/1 L1 S1 D1x S2 M2 | .... |
| 160 | # 8/2 S1 L1x S2 | .... |
| 161 | #####... ................|............ |
# Second part of the generated assembly, appended to $code with the same
# register-name interpolation as the first here-document.
#
# What the emitted text contains, in order:
#   * The body of the loop opened by the preceding SPLOOPD, ending at
#     SPKERNEL.  The "; 0", "; 6/0", "; 12/6/0" ... tags on the instructions
#     correspond to the row labels of the scheduling table in the comment
#     block above.
#   * The code after SPKERNEL: when B0 is non-zero the next 16 input bytes
#     are loaded, XORed into Z, and control branches back to ghash_loop?;
#     when B0 is zero the routine branches to RA.  Z is stored through $xip
#     in both cases, and under .LITTLE_ENDIAN it is first rearranged with
#     SWAP2/SWAP4.
#   * A .const section holding the identification string.  The "\@" stops
#     Perl from interpolating an array inside the here-document.
# Instruction order and the "||" parallel-issue markers are part of the
# emitted program; do not reorder or reflow these lines.
| 162 | $code.=<<___; |
| 163 | XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1) |
| 164 | || XORMPY $H01u,$xib,$H01y |
| 165 | || [A0] LDBU *--${xip},$x0 |
| 166 | XORMPY $H1,$xia,$H1x ; 1 |
| 167 | XORMPY $H2,$xia,$H2x ; 2 |
| 168 | || XORMPY $H2u,$xib,$H2y |
| 169 | XORMPY $H3,$xia,$H3x ; 3 |
| 170 | || XORMPY $H3u,$xib,$H3y |
| 171 | ||[!A0] MVK.D 15,A0 ; *--${xip} counter |
| 172 | XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1) |
| 173 | || [A0] SUB.S A0,1,A0 |
| 174 | XOR.L $H1x,$Z1,$Z1 ; 5 |
| 175 | || AND.D $H01y,$FF000000,$H0z |
| 176 | || SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y |
| 177 | || SHL $x0,1,$xib |
| 178 | || SHL $x0,1,$xia |
| 179 | |
| 180 | XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue |
| 181 | || SHL $Z0,1,$rem ; ; rem=Z<<1 |
| 182 | || SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8 |
| 183 | || AND.L $H1y,$FF000000,$H1z |
| 184 | XOR.L $H3x,$Z3,$Z3 ; 7/1 |
| 185 | || SHRMB.S $Z2,$Z1,$Z1 |
| 186 | || XOR.D $H0z,$Z0,$Z0 ; merge upper byte products |
| 187 | || AND.S $H2y,$FF000000,$H2z |
| 188 | || XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE |
| 189 | XOR.L $H1z,$Z1,$Z1 ; 8/2 |
| 190 | || SHRMB.S $Z3,$Z2,$Z2 |
| 191 | || AND.S $H3y,$FF000000,$H3z |
| 192 | XOR.L $H2z,$Z2,$Z2 ; 9/3 |
| 193 | || SHRU $Z3,8,$Z3 |
| 194 | XOR.D $H3z,$Z3,$Z3 ; 10/4 |
| 195 | NOP ; 11/5 |
| 196 | |
| 197 | SPKERNEL 0,2 |
| 198 | || XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res |
| 199 | |
| 200 | ; input pre-fetch is possible where D1 slot is available... |
| 201 | [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/- |
| 202 | [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/- |
| 203 | NOP ; 10/- |
| 204 | .if .LITTLE_ENDIAN |
| 205 | SWAP2 $Z0,$Z1 ; 11/- |
| 206 | || SWAP4 $Z1,$Z0 |
| 207 | SWAP4 $Z1,$Z1 ; 12/- |
| 208 | || SWAP2 $Z0,$Z0 |
| 209 | SWAP2 $Z2,$Z3 |
| 210 | || SWAP4 $Z3,$Z2 |
| 211 | ||[!B0] BNOP RA |
| 212 | SWAP4 $Z3,$Z3 |
| 213 | || SWAP2 $Z2,$Z2 |
| 214 | || [B0] BNOP ghash_loop? |
| 215 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp |
| 216 | || [B0] XOR $H1x,$Z1,$Z1 |
| 217 | [B0] XOR $H2x,$Z2,$Z2 |
| 218 | || [B0] XOR $H3x,$Z3,$Z3 |
| 219 | || [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall |
| 220 | STDW $Z1:$Z0,*${xip}[1] |
| 221 | || [B0] SHRU $Z1,16,$x0 ; Xi[14] |
| 222 | || [B0] ZERO $Z1:$Z0 |
| 223 | .else |
| 224 | [!B0] BNOP RA ; 11/- |
| 225 | [B0] BNOP ghash_loop? ; 12/- |
| 226 | [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp |
| 227 | || [B0] XOR $H1x,$Z1,$Z1 |
| 228 | [B0] XOR $H2x,$Z2,$Z2 |
| 229 | || [B0] XOR $H3x,$Z3,$Z3 |
| 230 | || [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall |
| 231 | STDW $Z1:$Z0,*${xip}[1] |
| 232 | || [B0] SHRU $Z0,8,$x0 ; Xi[14] |
| 233 | || [B0] ZERO $Z1:$Z0 |
| 234 | .endif |
| 235 | STDW $Z3:$Z2,*${xip}[0] |
| 236 | || [B0] ZERO $Z3:$Z2 |
| 237 | || [B0] MV $xia,$x1 |
| 238 | [B0] ADDK 14,${xip} |
| 239 | .endasmfunc |
| 240 | |
| 241 | .sect .const |
| 242 | .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>" |
| 243 | .align 4 |
| 244 | ___ |
| 245 | |
# Write the accumulated assembly text to STDOUT, then close the handle
# explicitly: a buffered write error only surfaces at close time, and the
# script must fail rather than leave a truncated output file behind.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";