blob: 2674d43d1384b87614074a6e02e5c1305545c8bd [file] [log] [blame]
rjw1f884582022-01-06 17:20:42 +08001/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare - setup NEON registers for encryption
26 * - dec_prepare - setup NEON registers for decryption
27 * - enc_switch_key - change to new key after having prepared for encryption
28 * - encrypt_block - encrypt a single block
29 * - decrypt_block - decrypt a single block
30 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
/*
 * Select how the chaining-mode routines in this file reach the 2x/4x block
 * primitives.
 *
 * Inline flavour (INTERLEAVE undefined, or INTERLEAVE_INLINE defined):
 * each do_{en,de}crypt_blockNx expands the imported Nx macro in place and
 * FRAME_PUSH/FRAME_POP expand to nothing.
 *
 * Out-of-line flavour (INTERLEAVE defined, INTERLEAVE_INLINE not defined):
 * one local subroutine per direction wraps the imported Nx macro and the
 * do_* macros become a "bl" to it.  As "bl" overwrites x30, FRAME_PUSH
 * and FRAME_POP save and restore x29/x30 around the body of the caller.
 *
 * Every variant runs the primitive on v0-v1 (2x) or v0-v3 (4x) with the
 * operands w3, x2, x6, w7.
 */
#if !defined(INTERLEAVE) || defined(INTERLEAVE_INLINE)

#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#else

#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE != 2 && INTERLEAVE != 4
#error INTERLEAVE should equal 2 or 4
#elif INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#else

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#endif

	/* only the macros matching INTERLEAVE are ever expanded */
	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
/*
 * aes_ecb_encrypt - encrypt whole blocks, each one on its own (ECB)
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk:     passed to enc_prepare and to the block primitives
 *   w3 - rounds: passed to enc_prepare and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   w5 - first:  when zero, enc_prepare is skipped
 *
 * Once "first" has been tested, x5 and w6 are handed to the primitives as
 * extra operands (presumably scratch registers - confirm in the file that
 * includes this one).
 */
AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	cbz		w5, .Lecb_enc_bulk		/* not first: skip the key setup */

	enc_prepare	w3, x2, x5

.Lecb_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lecb_enc_rest
#if INTERLEAVE != 2
	ld1		{v0.16b-v3.16b}, [x1], #64	/* 4 pt blocks in */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64	/* 4 ct blocks out */
#else
	ld1		{v0.16b-v1.16b}, [x1], #32	/* 2 pt blocks in */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32	/* 2 ct blocks out */
#endif
	b		.Lecb_enc_bulk
.Lecb_enc_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lecb_enc_done			/* no leftover blocks */
#endif
.Lecb_enc_single:
	ld1		{v0.16b}, [x1], #16		/* 1 pt block in */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16		/* 1 ct block out */
	subs		w4, w4, #1
	bne		.Lecb_enc_single
.Lecb_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * aes_ecb_decrypt - decrypt whole blocks, each one on its own (ECB)
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk:     passed to dec_prepare and to the block primitives
 *   w3 - rounds: passed to dec_prepare and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   w5 - first:  when zero, dec_prepare is skipped
 *
 * Once "first" has been tested, x5 and w6 are handed to the primitives as
 * extra operands (presumably scratch registers - confirm in the file that
 * includes this one).
 */
AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .Lecb_dec_bulk		/* not first: skip the key setup */

	dec_prepare	w3, x2, x5

.Lecb_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lecb_dec_rest
#if INTERLEAVE != 2
	ld1		{v0.16b-v3.16b}, [x1], #64	/* 4 ct blocks in */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64	/* 4 pt blocks out */
#else
	ld1		{v0.16b-v1.16b}, [x1], #32	/* 2 ct blocks in */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32	/* 2 pt blocks out */
#endif
	b		.Lecb_dec_bulk
.Lecb_dec_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lecb_dec_done			/* no leftover blocks */
#endif
.Lecb_dec_single:
	ld1		{v0.16b}, [x1], #16		/* 1 ct block in */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16		/* 1 pt block out */
	subs		w4, w4, #1
	bne		.Lecb_dec_single
.Lecb_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
/*
 * aes_cbc_encrypt - CBC encryption, strictly one block at a time
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk:     passed to enc_prepare and encrypt_block
 *   w3 - rounds: passed to enc_prepare and encrypt_block
 *   w4 - blocks: number of blocks to process
 *   x5 - iv:     read when "first" is non-zero, always written on return
 *   w6 - first:  when zero, the iv load and enc_prepare are skipped and v0
 *                is used as found (presumably the chaining value left by a
 *                preceding call - confirm with the callers)
 *
 * Once "first" has been tested, x6 and w7 are handed to the primitives as
 * extra operands (presumably scratch registers).
 *
 * No interleaving is possible: each block's input depends on the previous
 * ciphertext block, which stays in v0 across iterations.
 */
AES_ENTRY(aes_cbc_encrypt)
	cbz		w6, .Lcbcencblocks

	ld1		{v0.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencblocks:
	/*
	 * The loop below is bottom-tested: entered with a zero count it
	 * would wrap the counter and run far past both buffers.  Leave
	 * early instead; the chaining value is handed back unchanged.
	 */
	cbz		w4, .Lcbcencout
.Lcbcencloop:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v0.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_encrypt)
208
209
/*
 * aes_cbc_decrypt - CBC decryption, INTERLEAVE blocks at a time where possible
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk:     passed to dec_prepare and to the block primitives
 *   w3 - rounds: passed to dec_prepare and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   x5 - iv:     read when "first" is non-zero, always written on return
 *   w6 - first:  when zero, the iv load and dec_prepare are skipped and v7
 *                is used as found (presumably left by a preceding call -
 *                confirm with the callers)
 *
 * v7 always holds the value to be XORed into the next decrypted block:
 * the iv at first, afterwards the most recent ciphertext block.
 */
AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	cbz		w6, .Lcbc_dec_bulk

	ld1		{v7.16b}, [x5]			/* fetch iv */
	dec_prepare	w3, x2, x6

.Lcbc_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lcbc_dec_rest
#if INTERLEAVE != 2
	ld1		{v0.16b-v3.16b}, [x1], #64	/* 4 ct blocks in */
	mov		v4.16b, v0.16b			/* keep ct 0..2 for chaining */
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	eor		v0.16b, v0.16b, v7.16b		/* pt0 = D(ct0) ^ previous */
	sub		x1, x1, #16			/* step back onto ct3 ... */
	ld1		{v7.16b}, [x1], #16		/* ... which chains onwards */
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#else
	ld1		{v0.16b-v1.16b}, [x1], #32	/* 2 ct blocks in */
	mov		v3.16b, v1.16b			/* keep both ct blocks */
	mov		v2.16b, v0.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b		/* pt0 = D(ct0) ^ previous */
	eor		v1.16b, v1.16b, v2.16b		/* pt1 = D(ct1) ^ ct0 */
	mov		v7.16b, v3.16b			/* ct1 chains onwards */
	st1		{v0.16b-v1.16b}, [x0], #32
#endif
	b		.Lcbc_dec_bulk
.Lcbc_dec_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lcbc_dec_done			/* no leftover blocks */
#endif
.Lcbc_dec_single:
	ld1		{v1.16b}, [x1], #16		/* 1 ct block in */
	mov		v0.16b, v1.16b			/* work on a copy */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v7.16b		/* pt = D(ct) ^ previous */
	st1		{v0.16b}, [x0], #16
	mov		v7.16b, v1.16b			/* this ct chains onwards */
	subs		w4, w4, #1
	bne		.Lcbc_dec_single
.Lcbc_dec_done:
	st1		{v7.16b}, [x5]			/* hand back the next iv */
	FRAME_POP
	ret
AES_ENDPROC(aes_cbc_decrypt)
263
264
265 /*
266 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
267 * int blocks, u8 ctr[], int first)
268 */
269
/*
 * aes_ctr_encrypt - CTR mode: XOR the input with encrypted counter blocks
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk:     passed to enc_prepare and to the block primitives
 *   w3 - rounds: passed to enc_prepare and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   x5 - ctr:    counter block; read when "first" is non-zero and written
 *                back on the regular exit path
 *   w6 - first:  when zero, enc_prepare and the counter load are skipped
 *                and v4 is used as found (presumably left by a preceding
 *                call - confirm with the callers)
 *
 * v4 holds the counter block in memory byte order.  x8 mirrors its last
 * eight bytes, byte-reversed so that they can be incremented as an integer.
 *
 * If the block-at-a-time loop finds the count negative after its decrement,
 * it stores the raw keystream block to [x0] and returns without reading any
 * input and without writing the counter back.
 */
AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	cbz		w6, .Lctr_resume		/* continuation call? */
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]			/* fetch counter block */

.Lctr_resume:
	umov		x8, v4.d[1]
	rev		x8, x8				/* integer view of the counter */
#if INTERLEAVE >= 2
	cmn		w8, w4				/* low 32 bits about to wrap? */
	bcs		.Lctr_single			/* yes: one block at a time */
.Lctr_bulk:
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lctr_rest
#if INTERLEAVE != 2
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w8			/* low counter word x 4 */
	add		v7.4s, v7.4s, v8.4s		/* ctr+1, ctr+2, ctr+3, ctr */
	rev32		v8.16b, v7.16b			/* back to block byte order */
	mov		v0.16b, v4.16b			/* block 0 uses ctr as is */
	mov		v1.16b, v4.16b
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]		/* patch in ctr+1 */
	mov		v2.s[3], v8.s[1]		/* patch in ctr+2 */
	mov		v3.s[3], v8.s[2]		/* patch in ctr+3 */
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	ld1		{v5.16b}, [x1], #16		/* get 4th input block */
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x8, x8, #INTERLEAVE
#else
	mov		v0.8b, v4.8b			/* first half of the block */
	rev		x7, x8
	ins		v0.d[1], x7			/* ... plus ctr */
	add		x8, x8, #1
	mov		v1.8b, v4.8b
	rev		x7, x8
	ins		v1.d[1], x7			/* ... plus ctr+1 */
	add		x8, x8, #1
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#endif
	rev		x7, x8
	ins		v4.d[1], x7			/* sync v4 with x8 */
	cbnz		w4, .Lctr_bulk
	b		.Lctr_done
.Lctr_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lctr_done			/* no leftover blocks */
#endif
.Lctr_single:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7		/* v0 = keystream block */

	adds		x8, x8, #1			/* bump the counter */
	rev		x7, x8
	ins		v4.d[1], x7
	bcs		.Lctr_carry			/* wrapped past 64 bits? */

.Lctr_carry_done:
	subs		w4, w4, #1
	bmi		.Lctr_tail			/* count < 0: tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctr_single			/* flags still from the subs */

.Lctr_done:
	st1		{v4.16b}, [x5]			/* hand back the next counter */
	FRAME_POP
	ret

.Lctr_tail:
	st1		{v0.16b}, [x0]			/* raw keystream only */
	FRAME_POP
	ret

.Lctr_carry:
	umov		x7, v4.d[0]			/* first eight counter bytes */
	rev		x7, x7				/* ... as an integer */
	add		x7, x7, #1			/* absorb the carry */
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctr_carry_done
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg
365
366
367 /*
368 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
369 * int blocks, u8 const rk2[], u8 iv[], int first)
370 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
371 * int blocks, u8 const rk2[], u8 iv[], int first)
372 */
373
/*
 * next_tweak - derive the following XTS tweak from the current one
 *
 *   out   - vector register receiving the new tweak; the callers in this
 *           file also pass the same register as "in" or as "const"
 *   in    - vector register holding the current tweak
 *   const - vector register holding two 64-bit feedback values; the
 *           callers in this file load it from .Lxts_mul_x
 *   tmp   - vector register used as scratch (clobbered)
 *
 * Both 64-bit lanes of "in" are doubled.  The top bit of each lane selects
 * that lane's value from "const"; the two selected values are swapped
 * between the lanes and XORed into the doubled result.  "const" is read
 * before "out" is written, which is what allows the two to coincide.
 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63		/* all-ones where top bit set */
	and		\tmp\().16b, \tmp\().16b, \const\().16b	/* pick feedback per lane */
	add		\out\().2d,  \in\().2d,   \in\().2d	/* double both lanes */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	/* swap the lanes */
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

/*
 * Feedback values for next_tweak: 1 and 0x87.  The CPU_LE and CPU_BE
 * variants list the two quads in opposite order (presumably so that the
 * register loaded with "ldr q" ends up the same on either endianness).
 */
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)
385
/*
 * aes_xts_encrypt - XTS encryption of whole blocks
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk1:    key used for the data blocks
 *   w3 - rounds: passed to the key setup and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   x5 - rk2:    key used once, to encrypt the iv into the first tweak
 *   x6 - iv:     read when "first" is non-zero
 *   w7 - first:  when zero, the setup is skipped and the tweak found in v4
 *                is advanced by one step before use (presumably v4 and the
 *                key state are left by a preceding call - confirm with the
 *                callers)
 *
 * v4 is the tweak of the block being processed and, on return, the tweak
 * of the last block processed; it is not written back to memory here.
 * v7 holds the next_tweak constant, except in the 4x path where it is
 * reused as the fourth tweak and therefore reloaded before the next pass.
 */
AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	cbz		w7, .Lxts_enc_advance		/* continuation call */

	ld1		{v4.16b}, [x6]			/* fetch iv */
	enc_prepare	w3, x5, x6			/* set up with rk2 */
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6			/* data blocks use rk1 */
	ldr		q7, .Lxts_mul_x
	b		.Lxts_enc_bulk

.Lxts_enc_advance:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.Lxts_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lxts_enc_rest
#if INTERLEAVE != 2
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	next_tweak	v6, v5, v7, v8
	next_tweak	v7, v6, v7, v8			/* v7: constant -> 4th tweak */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbnz		w4, .Lxts_enc_advance
	b		.Lxts_enc_done
#else
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_enc_bulk_last
	next_tweak	v4, v5, v7, v8
	b		.Lxts_enc_bulk
.Lxts_enc_bulk_last:
	mov		v4.16b, v5.16b			/* last tweak used */
	b		.Lxts_enc_done
#endif
.Lxts_enc_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lxts_enc_done			/* no leftover blocks */
#endif
.Lxts_enc_single:
	ld1		{v1.16b}, [x1], #16		/* get 1 pt block */
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxts_enc_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_enc_single
.Lxts_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)
456
457
/*
 * aes_xts_decrypt - XTS decryption of whole blocks
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - out:    destination, advanced by 16 bytes per block
 *   x1 - in:     source, advanced by 16 bytes per block
 *   x2 - rk1:    key used for the data blocks
 *   w3 - rounds: passed to the key setup and to the block primitives
 *   w4 - blocks: number of blocks still to be processed
 *   x5 - rk2:    key used once, to encrypt the iv into the first tweak
 *   x6 - iv:     read when "first" is non-zero
 *   w7 - first:  when zero, the setup is skipped and the tweak found in v4
 *                is advanced by one step before use (presumably v4 and the
 *                key state are left by a preceding call - confirm with the
 *                callers)
 *
 * The first tweak is produced by encrypting the iv, also when decrypting;
 * only afterwards is the decryption setup done with rk1.
 *
 * v4 is the tweak of the block being processed and, on return, the tweak
 * of the last block processed; it is not written back to memory here.
 * v7 holds the next_tweak constant, except in the 4x path where it is
 * reused as the fourth tweak and therefore reloaded before the next pass.
 */
AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	cbz		w7, .Lxts_dec_advance		/* continuation call */

	ld1		{v4.16b}, [x6]			/* fetch iv */
	enc_prepare	w3, x5, x6			/* set up with rk2 */
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6			/* data blocks use rk1 */
	ldr		q7, .Lxts_mul_x
	b		.Lxts_dec_bulk

.Lxts_dec_advance:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.Lxts_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE		/* a full group left? */
	bmi		.Lxts_dec_rest
#if INTERLEAVE != 2
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	next_tweak	v6, v5, v7, v8
	next_tweak	v7, v6, v7, v8			/* v7: constant -> 4th tweak */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbnz		w4, .Lxts_dec_advance
	b		.Lxts_dec_done
#else
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_dec_bulk_last
	next_tweak	v4, v5, v7, v8
	b		.Lxts_dec_bulk
.Lxts_dec_bulk_last:
	mov		v4.16b, v5.16b			/* last tweak used */
	b		.Lxts_dec_done
#endif
.Lxts_dec_rest:
	adds		w4, w4, #INTERLEAVE		/* undo the last subtraction */
	beq		.Lxts_dec_done			/* no leftover blocks */
#endif
.Lxts_dec_single:
	ld1		{v1.16b}, [x1], #16		/* get 1 ct block */
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxts_dec_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_dec_single
.Lxts_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)
528
529 /*
530 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
531 * int blocks, u8 dg[], int enc_before, int enc_after)
532 */
/*
 * aes_mac_update - fold input blocks into a running 16-byte digest
 *
 * Operands as used below (names taken from the prototype comment):
 *   x0 - in:         source, advanced by 16 bytes per block
 *   x1 - rk:         passed to enc_prepare and encrypt_block
 *   w2 - rounds:     passed to enc_prepare and encrypt_block
 *   w3 - blocks:     number of blocks still to be folded in (may be zero)
 *   x4 - dg:         digest, read on entry and written back on return
 *   w5 - enc_before: non-zero = encrypt the digest before the first block
 *   w6 - enc_after:  non-zero = encrypt the digest after the last block too
 *
 * x7 and w8 are handed to the primitives as extra operands (presumably
 * scratch registers).  w5 is reused inside the loop once it has been tested.
 */
AES_ENTRY(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* fetch digest */
	enc_prepare	w2, x1, x7
	cbnz		w5, .Lmac_encrypt		/* encrypt it up front? */

.Lmac_next:
	cbz		w3, .Lmac_done			/* input exhausted */
	ld1		{v1.16b}, [x0], #16		/* fetch next block */
	eor		v0.16b, v0.16b, v1.16b		/* fold it into the digest */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq			/* last block: enc_after, else ~0 */
	cbz		w5, .Lmac_done			/* last block and no enc_after */

.Lmac_encrypt:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmac_next

.Lmac_done:
	st1		{v0.16b}, [x4]			/* hand back digest */
	ret
AES_ENDPROC(aes_mac_update)