/*
 * ====================================================================
 * Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project. The module is, however, dual licensed under OpenSSL and
 * CRYPTOGAMS licenses depending on where you obtain it. For further
 * details see http://www.openssl.org/~appro/cryptogams/.
 * ====================================================================
 *
 * SHA256/512 for ARMv8.
 *
 * Performance in cycles per processed byte and improvement coefficient
 * over code generated with "default" compiler:
 *
 *              SHA256-hw   SHA256(*)       SHA512
 * Apple A7     1.97        10.5 (+33%)     6.73 (-1%(**))
 * Cortex-A53   2.38        15.6 (+110%)    10.1 (+190%(***))
 * Cortex-A57   2.31        11.6 (+86%)     7.51 (+260%(***))
 *
 * (*)   Software SHA256 results are of lesser relevance, presented
 *       mostly for informational purposes.
 * (**)  The result is a trade-off: it's possible to improve it by
 *       10% (or by 1 cycle per round), but at the cost of 20% loss
 *       on Cortex-A53 (or by 4 cycles per round).
 * (***) Super-impressive coefficients over gcc-generated code are an
 *       indication of some compiler "pathology", most notably code
 *       generated with -mgeneral-regs-only is significantly faster
 *       and lags behind assembly only by 50-90%.
 */
// Code section; export the block function as a global function symbol.
.text
.globl	sha256_block_data_order
.type	sha256_block_data_order,%function
// SHA-256 round constants K[0..63] as 32-bit words, four per line.
// sha256_block_data_order walks this table 16 bytes at a time and
// rewinds its pointer after each block. The table is aligned to
// 2^6 = 64 bytes and lives in .text so it can be addressed with `adr`.
.align	6
.type	.LK256,%object
.LK256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0	//terminator (not read by the hardware loop in this file)
.size	.LK256,.-.LK256
// PC-relative offset from this slot to OPENSSL_armcap_P (8-byte aligned).
// NOTE(review): nothing visible in this file reads the slot; it is kept
// so the symbol reference and the layout stay as before.
.align	3
.LOPENSSL_armcap_P:
.quad	OPENSSL_armcap_P-.
// NUL-terminated identification string; emits the same bytes as the
// previous `.byte 83,72,65,...,62,0` list.
.asciz	"SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
.type	sha256_block_data_order,%function
.align	6
//----------------------------------------------------------------------
// sha256_block_data_order
//
// Runs the SHA-256 compression function over consecutive 64-byte
// blocks. The SHA-256 instructions are emitted as raw `.inst` words;
// the mnemonic each word encodes is given in the trailing comment.
//
// In:    x0 = pointer to the 32-byte hash state; loaded on entry and
//             stored back on exit
//        x1 = pointer to the input; 64 bytes are consumed per block
//        x2 = number of 64-byte blocks (0 returns without touching
//             the state or the input)
// Out:   updated state at [x0]
// Clobb: x1, x2, x3, v0-v2, v4-v7, v16-v19
// Stack: 16 bytes (x29/x30 pair)
//
// Register roles inside the loop:
//        v0,v1    working state (two 128-bit halves)
//        v2       copy of v0 taken before sha256h, consumed by sha256h2
//        v4-v7    16 message words, updated in place by sha256su0/su1
//        v16,v17  alternating "round constants + message words" inputs
//        v18,v19  state at the start of the block (added back at end)
//        x3       cursor into .LK256
//----------------------------------------------------------------------
sha256_block_data_order:
.Lv8_entry:
	// Without this guard a zero count would wrap in the `sub` below
	// and the loop would run on far past the end of the input.
	cbz	x2,.Lv8_done

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v0.4s,v1.4s},[x0]		// load hash state
	adr	x3,.LK256			// x3 = &K[0]

.Loop_hw:
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64	// next 64-byte block
	sub	x2,x2,#1			// one block fewer to go
	ld1	{v16.4s},[x3],#16		// K[0..3]
	rev32	v4.16b,v4.16b			// byte-swap each 32-bit word
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
	orr	v18.16b,v0.16b,v0.16b		// offload
	orr	v19.16b,v1.16b,v1.16b

	// rounds 0-3
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 4-7
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 8-11
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 12-15
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 16-19
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 20-23
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 24-27
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 28-31
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 32-35
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b

	// rounds 36-39
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b

	// rounds 40-43
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v6.4s
	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b

	// rounds 44-47
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v7.4s
	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b

	// rounds 48-51 (no further message-schedule updates from here on)
	ld1	{v17.4s},[x3],#16
	add	v16.4s,v16.4s,v4.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s

	// rounds 52-55
	ld1	{v16.4s},[x3],#16
	add	v17.4s,v17.4s,v5.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s

	// rounds 56-59
	ld1	{v17.4s},[x3]			// last 16 bytes of K, no post-increment
	add	v16.4s,v16.4s,v6.4s
	sub	x3,x3,#64*4-16	// rewind: undo the 15 post-increments (240 bytes)
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s

	// rounds 60-63
	add	v17.4s,v17.4s,v7.4s
	orr	v2.16b,v0.16b,v0.16b
	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s

	// add the block's starting state back in
	add	v0.4s,v0.4s,v18.4s
	add	v1.4s,v1.4s,v19.4s

	cbnz	x2,.Loop_hw			// more blocks?

	st1	{v0.4s,v1.4s},[x0]		// store updated hash state

	// x30 is never modified in this function, so only x29 is reloaded.
	ldr	x29,[sp],#16
.Lv8_done:
	ret
.size	sha256_block_data_order,.-sha256_block_data_order
// Common-symbol declaration for OPENSSL_armcap_P: size 4, alignment 4.
.comm	OPENSSL_armcap_P,4,4