rjw | 1f88458 | 2022-01-06 17:20:42 +0800 | [diff] [blame^] | 1 | /*
|
| 2 | * ====================================================================
|
| 3 | * Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
| 4 | * project. The module is, however, dual licensed under OpenSSL and
|
| 5 | * CRYPTOGAMS licenses depending on where you obtain it. For further
|
| 6 | * details see http://www.openssl.org/~appro/cryptogams/.
|
| 7 | * ====================================================================
|
| 8 | *
|
| 9 | * SHA256/512 for ARMv8.
|
| 10 | *
|
| 11 | * Performance in cycles per processed byte and improvement coefficient
|
| 12 | * over code generated with "default" compiler:
|
| 13 | *
|
| 14 | * SHA256-hw SHA256(*) SHA512
|
| 15 | * Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
|
| 16 | * Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***))
|
| 17 | * Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
|
| 18 | *
|
| 19 | * (*) Software SHA256 results are of lesser relevance, presented
|
| 20 | * mostly for informational purposes.
|
| 21 | * (**) The result is a trade-off: it's possible to improve it by
|
| 22 | * 10% (or by 1 cycle per round), but at the cost of 20% loss
|
| 23 | * on Cortex-A53 (or by 4 cycles per round).
|
| 24 | * (***) Super-impressive coefficients over gcc-generated code are
|
| 25 | * an indication of some compiler "pathology", most notably code
|
| 26 | * generated with -mgeneral-regs-only is significantly faster
|
| 27 | * and lags behind assembly only by 50-90%.
|
| 28 | */
|
| 29 |
|
| 30 | .text // the constant table below lives in .text, where the function reaches it with adr
|
| 31 | .globl sha256_block_data_order
|
| 32 | .type sha256_block_data_order,%function
|
| 33 | .align 6 // 2^6 = 64-byte alignment for the table
|
| 34 | .type .LK256,%object
|
| 35 | .LK256: // 64 32-bit SHA-256 round constants K[0..63], then one zero word
|
| 36 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 // K[0..3]
|
| 37 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 // K[4..7]
|
| 38 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 // K[8..11]
|
| 39 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 // K[12..15]
|
| 40 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc // K[16..19]
|
| 41 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da // K[20..23]
|
| 42 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 // K[24..27]
|
| 43 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 // K[28..31]
|
| 44 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 // K[32..35]
|
| 45 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 // K[36..39]
|
| 46 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 // K[40..43]
|
| 47 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 // K[44..47]
|
| 48 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 // K[48..51]
|
| 49 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 // K[52..55]
|
| 50 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 // K[56..59]
|
| 51 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 // K[60..63]
|
| 52 | .long 0 //terminator; the code visible in this file reads only the 64 words above
|
| 53 | .size .LK256,.-.LK256
|
| 54 | .align 3 // 8-byte alignment for the .quad that follows
|
| 55 | .LOPENSSL_armcap_P: // PC-relative reference to OPENSSL_armcap_P; no code visible in this file reads it
|
| 56 | .quad OPENSSL_armcap_P-. // 64-bit offset from this location to the symbol
|
| 57 | .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ASCII: "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
|
| 58 | .align 2 // back to 4-byte alignment after the byte string
|
| 59 | .align 2 // repeats the previous directive; no additional effect
|
| 60 | .type sha256_block_data_order,%function // repeats the .type declaration made ahead of .LK256
|
| 61 | .align 6 // 64-byte alignment for the function entry
|
| 62 | sha256_block_data_order: // In: x0 = 32-byte hash state (updated in place), x1 = input data, x2 = number of 64-byte blocks
|
| 63 | .Lv8_entry: // Uses the ARMv8 SHA-256 instructions (emitted via .inst). Clobbers x1-x3, v0-v2, v4-v7, v16-v19
|
| 64 | stp x29,x30,[sp,#-16]! // push a 16-byte frame record
|
| 65 | add x29,sp,#0 // x29 = sp
|
| 66 |
|
| 67 | ld1 {v0.4s,v1.4s},[x0] // v0,v1 = the eight 32-bit state words
|
| 68 | adr x3,.LK256 // x3 = pointer into the round-constant table
|
| 69 |
|
| 70 | .Loop_hw: // one iteration per 64-byte input block
|
| 71 | ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 // v4-v7 = next block; x1 advances by 64
|
| 72 | sub x2,x2,#1 // NOTE(review): decremented before the cbnz test, so x2 == 0 on entry would wrap around - callers presumably pass >= 1; confirm
|
| 73 | ld1 {v16.4s},[x3],#16 // K[0..3]
|
| 74 | rev32 v4.16b,v4.16b // reverse the bytes inside every 32-bit word of the block
|
| 75 | rev32 v5.16b,v5.16b
|
| 76 | rev32 v6.16b,v6.16b
|
| 77 | rev32 v7.16b,v7.16b
|
| 78 | orr v18.16b,v0.16b,v0.16b // offload: keep the incoming state in v18/v19 for the final add
|
| 79 | orr v19.16b,v1.16b,v1.16b
|
| 80 | ld1 {v17.4s},[x3],#16 // K[4..7]; v16 and v17 alternate as the constant+message operand
|
| 81 | add v16.4s,v16.4s,v4.4s // v16 = K + message words for the next sha256h/sha256h2 pair
|
| 82 | .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
|
| 83 | orr v2.16b,v0.16b,v0.16b // v2 = copy of v0: sha256h overwrites v0, sha256h2 still needs the old value
|
| 84 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 85 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 86 | .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
|
| 87 | ld1 {v16.4s},[x3],#16
|
| 88 | add v17.4s,v17.4s,v5.4s
|
| 89 | .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
|
| 90 | orr v2.16b,v0.16b,v0.16b
|
| 91 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 92 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 93 | .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
|
| 94 | ld1 {v17.4s},[x3],#16
|
| 95 | add v16.4s,v16.4s,v6.4s
|
| 96 | .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
|
| 97 | orr v2.16b,v0.16b,v0.16b
|
| 98 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 99 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 100 | .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
|
| 101 | ld1 {v16.4s},[x3],#16
|
| 102 | add v17.4s,v17.4s,v7.4s
|
| 103 | .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
|
| 104 | orr v2.16b,v0.16b,v0.16b
|
| 105 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 106 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 107 | .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
|
| 108 | ld1 {v17.4s},[x3],#16
|
| 109 | add v16.4s,v16.4s,v4.4s
|
| 110 | .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
|
| 111 | orr v2.16b,v0.16b,v0.16b
|
| 112 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 113 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 114 | .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
|
| 115 | ld1 {v16.4s},[x3],#16
|
| 116 | add v17.4s,v17.4s,v5.4s
|
| 117 | .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
|
| 118 | orr v2.16b,v0.16b,v0.16b
|
| 119 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 120 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 121 | .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
|
| 122 | ld1 {v17.4s},[x3],#16
|
| 123 | add v16.4s,v16.4s,v6.4s
|
| 124 | .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
|
| 125 | orr v2.16b,v0.16b,v0.16b
|
| 126 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 127 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 128 | .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
|
| 129 | ld1 {v16.4s},[x3],#16
|
| 130 | add v17.4s,v17.4s,v7.4s
|
| 131 | .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
|
| 132 | orr v2.16b,v0.16b,v0.16b
|
| 133 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 134 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 135 | .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
|
| 136 | ld1 {v17.4s},[x3],#16
|
| 137 | add v16.4s,v16.4s,v4.4s
|
| 138 | .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
|
| 139 | orr v2.16b,v0.16b,v0.16b
|
| 140 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 141 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 142 | .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
|
| 143 | ld1 {v16.4s},[x3],#16
|
| 144 | add v17.4s,v17.4s,v5.4s
|
| 145 | .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
|
| 146 | orr v2.16b,v0.16b,v0.16b
|
| 147 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 148 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 149 | .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
|
| 150 | ld1 {v17.4s},[x3],#16
|
| 151 | add v16.4s,v16.4s,v6.4s
|
| 152 | .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
|
| 153 | orr v2.16b,v0.16b,v0.16b
|
| 154 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 155 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 156 | .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
|
| 157 | ld1 {v16.4s},[x3],#16
|
| 158 | add v17.4s,v17.4s,v7.4s
|
| 159 | .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
|
| 160 | orr v2.16b,v0.16b,v0.16b
|
| 161 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 162 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 163 | .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
|
| 164 | ld1 {v17.4s},[x3],#16
|
| 165 | add v16.4s,v16.4s,v4.4s // from here on there are no more sha256su0/sha256su1 steps
|
| 166 | orr v2.16b,v0.16b,v0.16b
|
| 167 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 168 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 169 |
|
| 170 | ld1 {v16.4s},[x3],#16
|
| 171 | add v17.4s,v17.4s,v5.4s
|
| 172 | orr v2.16b,v0.16b,v0.16b
|
| 173 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 174 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 175 |
|
| 176 | ld1 {v17.4s},[x3] // K[60..63]; the only table load without post-increment
|
| 177 | add v16.4s,v16.4s,v6.4s
|
| 178 | sub x3,x3,#64*4-16 // rewind x3 to .LK256: undoes the 15 post-incremented 16-byte loads
|
| 179 | orr v2.16b,v0.16b,v0.16b
|
| 180 | .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
|
| 181 | .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
|
| 182 |
|
| 183 | add v17.4s,v17.4s,v7.4s
|
| 184 | orr v2.16b,v0.16b,v0.16b
|
| 185 | .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
|
| 186 | .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
|
| 187 |
|
| 188 | add v0.4s,v0.4s,v18.4s // add the state saved at the top of the iteration back in
|
| 189 | add v1.4s,v1.4s,v19.4s
|
| 190 |
|
| 191 | cbnz x2,.Loop_hw // repeat until the block counter reaches zero
|
| 192 |
|
| 193 | st1 {v0.4s,v1.4s},[x0] // write the updated state back through x0
|
| 194 |
|
| 195 | ldr x29,[sp],#16 // restore x29 and pop the frame; x30 is not reloaded (nothing in this function writes it)
|
| 196 | ret
|
| 197 | .size sha256_block_data_order,.-sha256_block_data_order
|
| 198 | .comm OPENSSL_armcap_P,4,4 // common-symbol declaration of OPENSSL_armcap_P: size 4, alignment argument 4
|