#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to original engine
# version relying on inline assembler and compiled with gcc 3.4.6 it
# was measured to provide ~100% improvement on misaligned data in ECB
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than below software alignment procedures, nor does 'align'
# flag have any effect on aligned performance [if it has any meaning
# at all]. Therefore suggestion is to unconditionally set 'align' flag
# on Nano for optimal performance.

# Make the directory holding this script, and the perlasm directory
# relative to it, searchable before pulling in the x86 perlasm
# framework.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything the
# perlasm helpers print goes there through STDOUT.  Fail loudly instead
# of silently generating nothing when it is missing or cannot be opened.
$output=pop;
defined($output) or die "$0: no output file specified\n";
open STDOUT,">",$output or die "can't open $output: $!";

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

# Register assignment shared by all routines below.
$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

# padlock_capability: returns 0 in eax when the CPU is not usable,
# otherwise edx of CPUID leaf 0xC0000001 with bit#4 replaced by a
# "this is a Nano" flag.
&function_begin_B("padlock_capability");
    &push   ("ebx");
    # Try to toggle bit 21 of EFLAGS; if it does not stick, CPUID is
    # not available and we bail out with eax=0.
    &pushf  ();
    &pop    ("eax");
    &mov    ("ecx","eax");
    &xor    ("eax",1<<21);
    &push   ("eax");
    &popf   ();
    &pushf  ();
    &pop    ("eax");
    &xor    ("ecx","eax");
    &xor    ("eax","eax");
    &bt     ("ecx",21);
    &jnc    (&label("noluck"));
    &cpuid  ();
    &xor    ("eax","eax");
    # Vendor string check.  The literals are written byte-reversed so
    # that unpack("H*") yields the little-endian register image:
    # "Cent" "aurH" "auls" in ebx:edx:ecx ...
    &cmp    ("ebx","0x".unpack("H*",'tneC'));
    &jne    (&label("zhaoxin"));
    &cmp    ("edx","0x".unpack("H*",'Hrua'));
    &jne    (&label("noluck"));
    &cmp    ("ecx","0x".unpack("H*",'slua'));
    &jne    (&label("noluck"));
    &jmp    (&label("zhaoxinEnd"));
&set_label("zhaoxin");
    # ... or "  Sh" "angh" "ai  ".  Each literal must be exactly four
    # bytes, i.e. the padding blanks are significant.
    &cmp    ("ebx","0x".unpack("H*",'hS  '));
    &jne    (&label("noluck"));
    &cmp    ("edx","0x".unpack("H*",'hgna'));
    &jne    (&label("noluck"));
    &cmp    ("ecx","0x".unpack("H*",'  ia'));
    &jne    (&label("noluck"));
&set_label("zhaoxinEnd");
    # Leaf 0xC0000000 reports the highest supported 0xC000xxxx leaf;
    # we need at least 0xC0000001.
    &mov    ("eax",0xC0000000);
    &cpuid  ();
    &mov    ("edx","eax");
    &xor    ("eax","eax");
    &cmp    ("edx",0xC0000001);
    &jb     (&label("noluck"));
    &mov    ("eax",1);
    &cpuid  ();
    &or     ("eax",0x0f);
    &xor    ("ebx","ebx");
    &and    ("eax",0x0fff);
    &cmp    ("eax",0x06ff);             # check for Nano
    &sete   ("bl");
    &mov    ("eax",0xC0000001);
    &push   ("ebx");                    # cpuid clobbers ebx, keep Nano flag
    &cpuid  ();
    &pop    ("ebx");
    &mov    ("eax","edx");
    &shl    ("ebx",4);                  # bit#4 denotes Nano
    &and    ("eax",0xffffffef);
    &or     ("eax","ebx");
&set_label("noluck");
    &pop    ("ebx");
    &ret    ();
&function_end_B("padlock_capability");

# padlock_key_bswap: byte-swaps, in place, a run of 32-bit words
# starting at the pointer passed as the first argument.  The number of
# words is read from offset 240 of that same structure.
&function_begin_B("padlock_key_bswap");
    &mov    ("edx",&wparam(0));
    &mov    ("ecx",&DWP(240,"edx"));    # word count
&set_label("bswap_loop");
    &mov    ("eax",&DWP(0,"edx"));
    &bswap  ("eax");
    &mov    (&DWP(0,"edx"),"eax");
    &lea    ("edx",&DWP(4,"edx"));      # advance without touching flags
    &sub    ("ecx",1);
    &jnz    (&label("bswap_loop"));
    &ret    ();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one
# believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context(ctx): loads ctx and the address of
# padlock_saved_context (absolute on win32/coff, otherwise as an offset
# from verify_pic_point that the callee turns into an address), pushes
# EFLAGS and hands over to _padlock_verify_ctx.
&function_begin_B("padlock_verify_context");
    &mov    ($ctx,&wparam(0));
    &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
                &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
    &pushf  ();
    &call   ("_padlock_verify_ctx");
&set_label("verify_pic_point");
    &lea    ("esp",&DWP(4,"esp"));      # drop the saved EFLAGS
    &ret    ();
&function_end_B("padlock_verify_context");

# _padlock_verify_ctx: internal helper.  On entry $ctx holds the
# context pointer, eax the address of padlock_saved_context (or, for
# non-win32/coff targets, its offset from the caller's pic point), and
# the caller has pushed EFLAGS just before the call, so the stack holds
# [esp]=return address, [esp+4]=saved EFLAGS.
&function_begin_B("_padlock_verify_ctx");
    # The return address is the caller's pic point; adding it turns the
    # offset into the absolute address.
    &add    ("eax",&DWP(0,"esp"))       if(!($::win32 or $::coff));# &padlock_saved_context
    &bt     (&DWP(4,"esp"),30);         # eflags
    &jnc    (&label("verified"));
    &cmp    ($ctx,&DWP(0,"eax"));
    &je     (&label("verified"));
    # Bit 30 set and a different context was recorded: do the same
    # pushf/popf sequence as padlock_reload_key.
    &pushf  ();
    &popf   ();
&set_label("verified");
    &mov    (&DWP(0,"eax"),$ctx);       # remember current context
    &ret    ();
&function_end_B("_padlock_verify_ctx");

# padlock_reload_key: consists solely of a pushf/popf pair, the
# sequence this module uses to make the unit reload its key.
&function_begin_B("padlock_reload_key");
    &pushf  ();
    &popf   ();
    &ret    ();
&function_end_B("padlock_reload_key");

# padlock_aes_block(out, inp, ctx): processes exactly one block with
# "rep xcryptecb".  The control word is taken from ctx+16 and the key
# from ctx+32.
&function_begin_B("padlock_aes_block");
    &push   ("edi");
    &push   ("esi");
    &push   ("ebx");
    # Arguments sit above the three registers just pushed.
    &mov    ($out,&wparam(0));          # must be 16-byte aligned
    &mov    ($inp,&wparam(1));          # must be 16-byte aligned
    &mov    ($ctx,&wparam(2));
    &mov    ($len,1);                   # one block
    &lea    ("ebx",&DWP(32,$ctx));      # key
    &lea    ($ctx,&DWP(16,$ctx));       # control word
    &data_byte(0xf3,0x0f,0xa7,0xc8);    # rep xcryptecb
    &pop    ("ebx");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_aes_block");

# generate_mode($mode,$opcode) emits one bulk routine:
#
# int padlock_$mode_encrypt(void *out, const void *inp,
#               struct padlock_cipher_data *ctx, size_t len);
#
# $mode   - "ecb", "cbc", "cfb", "ofb" or "ctr32"; used to name the
#           function and its labels and to select mode-specific code.
# $opcode - final byte of the "rep xcrypt*" encoding
#           (0xf3,0x0f,0xa7,$opcode).
#
# The generated routine returns 1 on success.  It bails out, returning
# 0, when ctx or len is not a multiple of 16.  Input and/or output that
# is not 16-byte aligned is bounced through a 16-byte aligned scratch
# area carved out of the stack, at most $PADLOCK_CHUNK bytes at a time;
# the scratch area is zeroed before returning.  Modes listed in
# %PADLOCK_PREFETCH additionally copy the tail of the input to the
# stack when the input ends closer to a page boundary than the listed
# distance.
sub generate_mode {
    my ($mode,$opcode) = @_;
    &function_begin("padlock_${mode}_encrypt");
        &mov    ($out,&wparam(0));
        &mov    ($inp,&wparam(1));
        &mov    ($ctx,&wparam(2));
        &mov    ($len,&wparam(3));
        # Pre-clear the return value so that the abort path returns 0
        # instead of whatever the caller left in eax; eax is reloaded
        # below before its next use.
        &xor    ("eax","eax");
        &test   ($ctx,15);
        &jnz    (&label("${mode}_abort"));
        &test   ($len,15);
        &jnz    (&label("${mode}_abort"));
        &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
                    &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
        &pushf  ();
        &cld    ();
        &call   ("_padlock_verify_ctx");
    &set_label("${mode}_pic_point");
        &lea    ($ctx,&DWP(16,$ctx));           # control word
        &xor    ("eax","eax");
    if ($mode eq "ctr32") {
        &movq   ("mm0",&QWP(-16,$ctx));         # load [upper part of] counter
    } else {
        &xor    ("ebx","ebx");
        &test   (&DWP(0,$ctx),1<<5);            # align bit in control word
        &jnz    (&label("${mode}_aligned"));
        &test   ($out,0x0f);
        &setz   ("al");                         # !out_misaligned
        &test   ($inp,0x0f);
        &setz   ("bl");                         # !inp_misaligned
        &test   ("eax","ebx");
        &jnz    (&label("${mode}_aligned"));
        &neg    ("eax");
    }
        &mov    ($chunk,$PADLOCK_CHUNK);
        &not    ("eax");                        # out_misaligned?-1:0
        &lea    ("ebp",&DWP(-24,"esp"));
        &cmp    ($len,$chunk);
        &cmovc  ($chunk,$len);                  # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
        &and    ("eax",$chunk);                 # out_misaligned?chunk:0
        &mov    ($chunk,$len);
        &neg    ("eax");
        &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
        &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
        &mov    ("eax",$PADLOCK_CHUNK);
        &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
        &mov    ("eax","ebp");
        &and    ("ebp",-16);
        &and    ("esp",-16);
        &mov    (&DWP(16,"ebp"),"eax");         # unaligned frame base, see _done
    if ($PADLOCK_PREFETCH{$mode}) {
        &cmp    ($len,$chunk);
        &ja     (&label("${mode}_loop"));
        &mov    ("eax",$inp);                   # check if prefetch crosses page
        &cmp    ("ebp","esp");
        &cmove  ("eax",$out);
        &add    ("eax",$len);
        &neg    ("eax");
        &and    ("eax",0xfff);                  # distance to page boundary
        &cmp    ("eax",$PADLOCK_PREFETCH{$mode});
        &mov    ("eax",-$PADLOCK_PREFETCH{$mode});
        &cmovae ("eax",$chunk);                 # mask=distance<prefetch?-prefetch:-1
        &and    ($chunk,"eax");
        &jz     (&label("${mode}_unaligned_tail"));
    }
        &jmp    (&label("${mode}_loop"));

    &set_label("${mode}_loop",16);
        &mov    (&DWP(0,"ebp"),$out);           # save parameters
        &mov    (&DWP(4,"ebp"),$inp);
        &mov    (&DWP(8,"ebp"),$len);
        &mov    ($len,$chunk);
        &mov    (&DWP(12,"ebp"),$chunk);        # chunk
    if ($mode eq "ctr32") {
        # Fill the scratch area with consecutive counter blocks; the
        # last 32-bit word is incremented as a big-endian number.
        &mov    ("ecx",&DWP(-4,$ctx));
        &xor    ($out,$out);
        &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
    &set_label("${mode}_prepare");
        &mov    (&DWP(12,"esp",$out),"ecx");
        &bswap  ("ecx");
        &movq   (&QWP(0,"esp",$out),"mm0");
        &inc    ("ecx");
        &mov    (&DWP(8,"esp",$out),"eax");
        &bswap  ("ecx");
        &lea    ($out,&DWP(16,$out));
        &cmp    ($out,$chunk);
        &jb     (&label("${mode}_prepare"));

        &mov    (&DWP(-4,$ctx),"ecx");
        &lea    ($inp,&DWP(0,"esp"));
        &lea    ($out,&DWP(0,"esp"));
        &mov    ($len,$chunk);
    } else {
        &test   ($out,0x0f);                    # out_misaligned
        &cmovnz ($out,"esp");
        &test   ($inp,0x0f);                    # inp_misaligned
        &jz     (&label("${mode}_inp_aligned"));
        &shr    ($len,2);
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
        &mov    ($len,$chunk);
        &mov    ($inp,$out);
    &set_label("${mode}_inp_aligned");
    }
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
        &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
    if ($mode !~ /ecb|ctr/) {
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
    }
        &mov    ($out,&DWP(0,"ebp"));           # restore parameters
        &mov    ($chunk,&DWP(12,"ebp"));
    if ($mode eq "ctr32") {
        # XOR the encrypted counter blocks in the scratch area into the
        # caller's data.
        &mov    ($inp,&DWP(4,"ebp"));
        &xor    ($len,$len);
    &set_label("${mode}_xor");
        &movups ("xmm1",&QWP(0,$inp,$len));
        &lea    ($len,&DWP(16,$len));
        &pxor   ("xmm1",&QWP(-16,"esp",$len));
        &movups (&QWP(-16,$out,$len),"xmm1");
        &cmp    ($len,$chunk);
        &jb     (&label("${mode}_xor"));
    } else {
        &test   ($out,0x0f);
        &jz     (&label("${mode}_out_aligned"));
        &mov    ($len,$chunk);
        &lea    ($inp,&DWP(0,"esp"));
        &shr    ($len,2);
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
    &set_label("${mode}_out_aligned");
        &mov    ($inp,&DWP(4,"ebp"));
    }
        &mov    ($len,&DWP(8,"ebp"));
        &add    ($out,$chunk);
        &add    ($inp,$chunk);
        &sub    ($len,$chunk);
        &mov    ($chunk,$PADLOCK_CHUNK);        # mov leaves flags from sub intact
    if (!$PADLOCK_PREFETCH{$mode}) {
        &jnz    (&label("${mode}_loop"));
    } else {
        &jz     (&label("${mode}_break"));
        &cmp    ($len,$chunk);
        &jae    (&label("${mode}_loop"));

    &set_label("${mode}_unaligned_tail");
        &xor    ("eax","eax");
        &cmp    ("esp","ebp");
        &cmove  ("eax",$len);
        &sub    ("esp","eax");                  # alloca
        &mov    ("eax", $out);                  # save parameters
        &mov    ($chunk,$len);
        &shr    ($len,2);
        &lea    ($out,&DWP(0,"esp"));
        &data_byte(0xf3,0xa5);                  # rep movsl
        &mov    ($inp,"esp");
        &mov    ($out,"eax");                   # restore parameters
        &mov    ($len,$chunk);
        &jmp    (&label("${mode}_loop"));

    &set_label("${mode}_break",16);
    }
    if ($mode ne "ctr32") {
        &cmp    ("esp","ebp");                  # nothing allocated, nothing to wipe
        &je     (&label("${mode}_done"));
    }
        # Zero the scratch area between esp and ebp.
        &pxor   ("xmm0","xmm0");
        &lea    ("eax",&DWP(0,"esp"));
    &set_label("${mode}_bzero");
        &movaps (&QWP(0,"eax"),"xmm0");
        &lea    ("eax",&DWP(16,"eax"));
        &cmp    ("ebp","eax");
        &ja     (&label("${mode}_bzero"));

    &set_label("${mode}_done");
        &mov    ("ebp",&DWP(16,"ebp"));         # unaligned frame base saved earlier
        &lea    ("esp",&DWP(24,"ebp"));
    if ($mode ne "ctr32") {
        &jmp    (&label("${mode}_exit"));

    &set_label("${mode}_aligned",16);
        if ($PADLOCK_PREFETCH{$mode}) {
        &lea    ("ebp",&DWP(0,$inp,$len));
        &neg    ("ebp");
        &and    ("ebp",0xfff);                  # distance to page boundary
        &xor    ("eax","eax");
        &cmp    ("ebp",$PADLOCK_PREFETCH{$mode});
        &mov    ("ebp",$PADLOCK_PREFETCH{$mode}-1);
        &cmovae ("ebp","eax");
        &and    ("ebp",$len);                   # remainder
        &sub    ($len,"ebp");
        &jz     (&label("${mode}_aligned_tail"));
        }
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
        &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
        if ($mode ne "ecb") {
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
        }
        if ($PADLOCK_PREFETCH{$mode}) {
        &test   ("ebp","ebp");
        &jz     (&label("${mode}_exit"));

    &set_label("${mode}_aligned_tail");
        # Set up the same stack frame as the unaligned path, copy the
        # remainder to the stack and let the common loop process it.
        &mov    ($len,"ebp");
        &lea    ("ebp",&DWP(-24,"esp"));
        &mov    ("esp","ebp");
        &mov    ("eax","ebp");
        &sub    ("esp",$len);
        &and    ("ebp",-16);
        &and    ("esp",-16);
        &mov    (&DWP(16,"ebp"),"eax");
        &mov    ("eax", $out);                  # save parameters
        &mov    ($chunk,$len);
        &shr    ($len,2);
        &lea    ($out,&DWP(0,"esp"));
        &data_byte(0xf3,0xa5);                  # rep movsl
        &mov    ($inp,"esp");
        &mov    ($out,"eax");                   # restore parameters
        &mov    ($len,$chunk);
        &jmp    (&label("${mode}_loop"));
        }
    &set_label("${mode}_exit");
    }
        &mov    ("eax",1);
        &lea    ("esp",&DWP(4,"esp"));          # popf
        &emms   ()                              if ($mode eq "ctr32");
    &set_label("${mode}_abort");
    &function_end("padlock_${mode}_encrypt");
}

# Instantiate the bulk routines; the second argument is the last byte
# of the corresponding "rep xcrypt*" encoding.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

# padlock_xstore(arg0, arg1): loads the first argument into edi and the
# second into edx, then executes the xstore instruction.  eax is not
# touched by this wrapper, so the caller sees whatever xstore leaves
# there.
&function_begin_B("padlock_xstore");
    &push   ("edi");
    &mov    ("edi",&wparam(0));
    &mov    ("edx",&wparam(1));
    &data_byte(0x0f,0xa7,0xc0);         # xstore
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_xstore");

# _win32_segv_handler: exception handler installed by the *_oneshot
# routines on win32/coff.  For an access violation it advances the
# dword at offset 184 of the context record by 4 (the length of the
# "rep xsha*" encoding) and asks for execution to continue; any other
# exception is passed on.
&function_begin_B("_win32_segv_handler");
    &mov    ("eax",1);                  # ExceptionContinueSearch
    &mov    ("edx",&wparam(0));         # *ExceptionRecord
    &mov    ("ecx",&wparam(2));         # *ContextRecord
    &cmp    (&DWP(0,"edx"),0xC0000005); # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
    &jne    (&label("ret"));
    &add    (&DWP(184,"ecx"),4);        # skip over rep sha*
    &mov    ("eax",0);                  # ExceptionContinueExecution
&set_label("ret");
    &ret    ();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")         if ($::win32);

# padlock_sha1_oneshot(ctx, inp, len): copies the 20-byte context to a
# 16-byte aligned scratch area on the stack, runs "rep xsha1" with
# eax=0 on it, and copies the result back to ctx.  On win32/coff the
# instruction is bracketed by _win32_segv_handler.
&function_begin_B("padlock_sha1_oneshot");
    &push   ("edi");
    &push   ("esi");
    &xor    ("eax","eax");
    &mov    ("edi",&wparam(0));
    &mov    ("esi",&wparam(1));
    &mov    ("ecx",&wparam(2));
    if ($::win32 or $::coff) {
    # Link an exception registration record at %fs:0 (eax is 0 here).
    &push   (&::islabel("_win32_segv_handler"));
    &data_byte(0x64,0xff,0x30);         # push %fs:(%eax)
    &data_byte(0x64,0x89,0x20);         # mov %esp,%fs:(%eax)
    }
    &mov    ("edx","esp");              # put aside %esp
    &add    ("esp",-128);               # 32 is enough but spec says 128
    &movups ("xmm0",&QWP(0,"edi"));     # copy-in context
    &and    ("esp",-16);
    &mov    ("eax",&DWP(16,"edi"));
    &movaps (&QWP(0,"esp"),"xmm0");
    &mov    ("edi","esp");
    &mov    (&DWP(16,"esp"),"eax");
    &xor    ("eax","eax");
    &data_byte(0xf3,0x0f,0xa6,0xc8);    # rep xsha1
    &movaps ("xmm0",&QWP(0,"esp"));
    &mov    ("eax",&DWP(16,"esp"));
    &mov    ("esp","edx");              # restore %esp
    if ($::win32 or $::coff) {
    # Unlink the registration record and drop the handler pointer.
    &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0
    &lea    ("esp",&DWP(4,"esp"));
    }
    &mov    ("edi",&wparam(0));
    &movups (&QWP(0,"edi"),"xmm0");     # copy-out context
    &mov    (&DWP(16,"edi"),"eax");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_sha1_oneshot");

# padlock_sha1_blocks(ctx, inp, len): same copy-in / copy-out scheme as
# padlock_sha1_oneshot, but "rep xsha1" is run with eax=-1 and no
# exception handler is installed.
&function_begin_B("padlock_sha1_blocks");
    &push   ("edi");
    &push   ("esi");
    &mov    ("edi",&wparam(0));
    &mov    ("esi",&wparam(1));
    &mov    ("edx","esp");              # put aside %esp
    &mov    ("ecx",&wparam(2));
    &add    ("esp",-128);
    &movups ("xmm0",&QWP(0,"edi"));     # copy-in context
    &and    ("esp",-16);
    &mov    ("eax",&DWP(16,"edi"));
    &movaps (&QWP(0,"esp"),"xmm0");
    &mov    ("edi","esp");
    &mov    (&DWP(16,"esp"),"eax");
    &mov    ("eax",-1);
    &data_byte(0xf3,0x0f,0xa6,0xc8);    # rep xsha1
    &movaps ("xmm0",&QWP(0,"esp"));
    &mov    ("eax",&DWP(16,"esp"));
    &mov    ("esp","edx");              # restore %esp
    &mov    ("edi",&wparam(0));
    &movups (&QWP(0,"edi"),"xmm0");     # copy-out context
    &mov    (&DWP(16,"edi"),"eax");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_sha1_blocks");

# padlock_sha256_oneshot(ctx, inp, len): copies the 32-byte context to
# a 16-byte aligned scratch area on the stack, runs "rep xsha256" with
# eax=0 on it, and copies the result back to ctx.  On win32/coff the
# instruction is bracketed by _win32_segv_handler.
&function_begin_B("padlock_sha256_oneshot");
    &push   ("edi");
    &push   ("esi");
    &xor    ("eax","eax");
    &mov    ("edi",&wparam(0));
    &mov    ("esi",&wparam(1));
    &mov    ("ecx",&wparam(2));
    if ($::win32 or $::coff) {
    # Link an exception registration record at %fs:0 (eax is 0 here).
    &push   (&::islabel("_win32_segv_handler"));
    &data_byte(0x64,0xff,0x30);         # push %fs:(%eax)
    &data_byte(0x64,0x89,0x20);         # mov %esp,%fs:(%eax)
    }
    &mov    ("edx","esp");              # put aside %esp
    &add    ("esp",-128);
    &movups ("xmm0",&QWP(0,"edi"));     # copy-in context
    &and    ("esp",-16);
    &movups ("xmm1",&QWP(16,"edi"));
    &movaps (&QWP(0,"esp"),"xmm0");
    &mov    ("edi","esp");
    &movaps (&QWP(16,"esp"),"xmm1");
    &xor    ("eax","eax");
    &data_byte(0xf3,0x0f,0xa6,0xd0);    # rep xsha256
    &movaps ("xmm0",&QWP(0,"esp"));
    &movaps ("xmm1",&QWP(16,"esp"));
    &mov    ("esp","edx");              # restore %esp
    if ($::win32 or $::coff) {
    # Unlink the registration record and drop the handler pointer.
    &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0
    &lea    ("esp",&DWP(4,"esp"));
    }
    &mov    ("edi",&wparam(0));
    &movups (&QWP(0,"edi"),"xmm0");     # copy-out context
    &movups (&QWP(16,"edi"),"xmm1");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_sha256_oneshot");

# padlock_sha256_blocks(ctx, inp, len): same copy-in / copy-out scheme
# as padlock_sha256_oneshot, but "rep xsha256" is run with eax=-1 and
# no exception handler is installed.
&function_begin_B("padlock_sha256_blocks");
    &push   ("edi");
    &push   ("esi");
    &mov    ("edi",&wparam(0));
    &mov    ("esi",&wparam(1));
    &mov    ("ecx",&wparam(2));
    &mov    ("edx","esp");              # put aside %esp
    &add    ("esp",-128);
    &movups ("xmm0",&QWP(0,"edi"));     # copy-in context
    &and    ("esp",-16);
    &movups ("xmm1",&QWP(16,"edi"));
    &movaps (&QWP(0,"esp"),"xmm0");
    &mov    ("edi","esp");
    &movaps (&QWP(16,"esp"),"xmm1");
    &mov    ("eax",-1);
    &data_byte(0xf3,0x0f,0xa6,0xd0);    # rep xsha256
    &movaps ("xmm0",&QWP(0,"esp"));
    &movaps ("xmm1",&QWP(16,"esp"));
    &mov    ("esp","edx");              # restore %esp
    &mov    ("edi",&wparam(0));
    &movups (&QWP(0,"edi"),"xmm0");     # copy-out context
    &movups (&QWP(16,"edi"),"xmm1");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks(ctx, inp, len): copies the 64-byte context to a
# 16-byte aligned scratch area on the stack, runs "rep xsha512" on it,
# and copies the result back to ctx.  Unlike the sha1/sha256 routines,
# eax is not loaded before the instruction.
&function_begin_B("padlock_sha512_blocks");
    &push   ("edi");
    &push   ("esi");
    &mov    ("edi",&wparam(0));
    &mov    ("esi",&wparam(1));
    &mov    ("ecx",&wparam(2));
    &mov    ("edx","esp");              # put aside %esp
    &add    ("esp",-128);
    &movups ("xmm0",&QWP(0,"edi"));     # copy-in context
    &and    ("esp",-16);
    &movups ("xmm1",&QWP(16,"edi"));
    &movups ("xmm2",&QWP(32,"edi"));
    &movups ("xmm3",&QWP(48,"edi"));
    &movaps (&QWP(0,"esp"),"xmm0");
    &mov    ("edi","esp");
    &movaps (&QWP(16,"esp"),"xmm1");
    &movaps (&QWP(32,"esp"),"xmm2");
    &movaps (&QWP(48,"esp"),"xmm3");
    &data_byte(0xf3,0x0f,0xa6,0xe0);    # rep xsha512
    &movaps ("xmm0",&QWP(0,"esp"));
    &movaps ("xmm1",&QWP(16,"esp"));
    &movaps ("xmm2",&QWP(32,"esp"));
    &movaps ("xmm3",&QWP(48,"esp"));
    &mov    ("esp","edx");              # restore %esp
    &mov    ("edi",&wparam(0));
    &movups (&QWP(0,"edi"),"xmm0");     # copy-out context
    &movups (&QWP(16,"edi"),"xmm1");
    &movups (&QWP(32,"edi"),"xmm2");
    &movups (&QWP(48,"edi"),"xmm3");
    &pop    ("esi");
    &pop    ("edi");
    &ret    ();
&function_end_B("padlock_sha512_blocks");

&asciz  ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align  (16);

&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

# Buffered write errors (e.g. a full disk) only surface at close time;
# do not let a truncated assembly file pass silently.
close STDOUT or die "error closing STDOUT: $!";