/*
 * ====================================================================
 * Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project. The module is, however, dual licensed under OpenSSL and
 * CRYPTOGAMS licenses depending on where you obtain it. For further
 * details see http://www.openssl.org/~appro/cryptogams/.
 * ====================================================================
 *
 * SHA256/512 for ARMv8.
 *
 * Performance in cycles per processed byte and improvement coefficient
 * over code generated with "default" compiler:
 *
 *              SHA256-hw   SHA256(*)       SHA512
 * Apple A7     1.97        10.5 (+33%)     6.73 (-1%(**))
 * Cortex-A53   2.38        15.6 (+110%)    10.1 (+190%(***))
 * Cortex-A57   2.31        11.6 (+86%)     7.51 (+260%(***))
 *
 * (*)   Software SHA256 results are of lesser relevance, presented
 *       mostly for informational purposes.
 * (**)  The result is a trade-off: it's possible to improve it by
 *       10% (or by 1 cycle per round), but at the cost of 20% loss
 *       on Cortex-A53 (or by 4 cycles per round).
 * (***) Super-impressive coefficients over gcc-generated code are
 *       indication of some compiler "pathology", most notably code
 *       generated with -mgeneral-regs-only is significantly faster
 *       and lags behind assembly only by 50-90%.
 */
// Section setup, export of sha256_block_data_order, and the SHA-256
// round-constant table.
//
// .LK256 holds the 64 32-bit round constants K[0..63] (FIPS 180-4,
// section 4.2.2) in round order, followed by a single zero word as a
// terminator.  The table is emitted into .text next to the code and is
// 64-byte aligned (.align 6); sha256_block_data_order takes its address
// with a pc-relative `adr`.
	.text
	.globl	sha256_block_data_order
	.type	sha256_block_data_order,%function
	.align	6
	.type	.LK256,%object
.LK256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5	// K[0..3]
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5	// K[4..7]
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3	// K[8..11]
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174	// K[12..15]
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc	// K[16..19]
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da	// K[20..23]
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7	// K[24..27]
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967	// K[28..31]
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13	// K[32..35]
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85	// K[36..39]
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3	// K[40..43]
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070	// K[44..47]
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5	// K[48..51]
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3	// K[52..55]
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208	// K[56..59]
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	// K[60..63]
	.long	0	//terminator
	.size	.LK256,.-.LK256
// 8-byte slot holding the link-time distance from this slot to the
// OPENSSL_armcap_P symbol, so that code can reach the symbol
// position-independently as (address of slot) + (value of slot).
	.align	3
.LOPENSSL_armcap_P:
	.quad	OPENSSL_armcap_P-.
// Module identification string, NUL-terminated, spelled out as bytes:
// "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
	.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to a 4-byte boundary after the odd-length string so that any
// instructions that follow are correctly aligned.
	.align	2
//----------------------------------------------------------------------
// sha256_block_data_order -- SHA-256 compression of whole 64-byte blocks
// using the ARMv8 Crypto Extension SHA-256 instructions.
//
// Presumed C view (confirm against the caller's header):
//   void sha256_block_data_order(uint32_t state[8], const void *in,
//                                size_t num);
//
// In:    x0 = pointer to the eight 32-bit chaining words; loaded once on
//             entry and written back once on exit
//        x1 = pointer to the input; 64 bytes are consumed per block
//        x2 = number of 64-byte blocks; 0 returns at once without
//             touching the state
// Out:   updated chaining words at [x0]
// Clobb: x1, x2, x3, v0-v2, v4-v7, v16-v19 (all caller-saved in AAPCS64)
// Stack: 16 bytes for the x29/x30 frame record
//
// Register roles inside the loop:
//   v0/v1    working state (words 0-3 / words 4-7)
//   v2       copy of v0 taken before sha256h, consumed by sha256h2
//   v4-v7    16-word message-schedule window, byte-swapped by rev32
//   v16/v17  alternating K[i..i+3] + W[i..i+3] inputs
//   v18/v19  state at block start, added back after the 64 rounds
//   x3       cursor into .LK256
//
// The SHA-256 instructions are emitted as raw .inst words; the comment
// on each line gives the mnemonic it encodes.
//----------------------------------------------------------------------
	.type	sha256_block_data_order,%function
	.align	6
sha256_block_data_order:
.Lv8_entry:
	// Zero blocks requested: without this check the `sub x2,x2,#1`
	// below would wrap to 2^64-1 and the loop would run far past the
	// end of the input.
	cbz	x2,.Lsha256_hw_ret

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v0.4s,v1.4s},[x0]		// v0 = state[0..3], v1 = state[4..7]
	adr	x3,.LK256			// x3 -> K[0]

.Loop_hw:
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64	// next 64-byte block
	sub	x2,x2,#1			// one block fewer to go
	ld1	{v16.4s},[x3],#16		// K[0..3]
	rev32	v4.16b,v4.16b			// byte-swap each 32-bit word
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
	orr	v18.16b,v0.16b,v0.16b		// offload
	orr	v19.16b,v1.16b,v1.16b

	// rounds 0-3
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 4-7
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 8-11
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 12-15
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 16-19
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 20-23
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 24-27
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 28-31
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 32-35
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 36-39
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 40-43
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 44-47
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 48-51 (the schedule is complete; no more su0/su1)
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s

	// rounds 52-55
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s

	// rounds 56-59
	ld1	{v17.4s},[x3]			// K[60..63], no post-increment
	add	v16.4s,v16.4s,v6.4s
	sub	x3,x3,#64*4-16	// rewind
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s

	// rounds 60-63
	add	v17.4s,v17.4s,v7.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s

	add	v0.4s,v0.4s,v18.4s		// add back the state saved at block start
	add	v1.4s,v1.4s,v19.4s

	cbnz	x2,.Loop_hw

	st1	{v0.4s,v1.4s},[x0]		// write the updated state back

	ldr	x29,[sp],#16			// x30 is never written here, so only x29 is reloaded
.Lsha256_hw_ret:
	ret
	.size	sha256_block_data_order,.-sha256_block_data_order
.comm OPENSSL_armcap_P,4,4 |