blob: 5f7e43d4f64a0c08dd32e3ba767e1726b357452d [file] [log] [blame]
/*
 * AES-NI + SSE2 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/* The five 128-bit AEGIS-128 state words, one XMM register each: */
#define STATE0 %xmm0
#define STATE1 %xmm1
#define STATE2 %xmm2
#define STATE3 %xmm3
#define STATE4 %xmm4
/*
 * KEY and MSG are two names for the same register (%xmm5): KEY is used by
 * the init routine, MSG by everything that processes data.
 */
#define KEY %xmm5
#define MSG %xmm5
/* Scratch registers (aegis128_update overwrites T0): */
#define T0 %xmm6
#define T1 %xmm7

/*
 * Integer argument registers, in the order the C prototypes in this file
 * list their parameters (state, length, src, dst).  The routines that take
 * a different parameter list (init, final) use the raw register names.
 */
#define STATEP %rdi
#define LEN %rsi
#define SRC %rdx
#define DST %rcx
29
/*
 * Initialization constants.  The 32 bytes are the Fibonacci sequence reduced
 * modulo 256 (0, 1, 1, 2, 3, 5, 8, 13, ...), split into two 16-byte halves.
 * crypto_aegis128_aesni_init loads const_0 into STATE2 and const_1 into
 * STATE1, and XORs them into the key copies held in STATE3 and STATE4.
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
 .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
 .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
 .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte i of this table holds the value i (0..15).
 * crypto_aegis128_aesni_dec_tail compares a broadcast byte count against it
 * to build a mask that selects the first LEN bytes of a block.
 */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
44
45.text
46
/*
 * aegis128_update
 *
 * Runs one AES round (aesenc) over every state register, using the register
 * of the next-lower state index as the round key:
 *
 *   STATE4 = AESRound(STATE4) ^ STATE0
 *   STATE0 = AESRound(STATE0) ^ STATE1
 *   STATE1 = AESRound(STATE1) ^ STATE2
 *   STATE2 = AESRound(STATE2) ^ STATE3
 *   STATE3 = AESRound(STATE3) ^ (STATE4 as it was on entry)
 *
 * The results are left where they were computed, so the logical position of
 * every register moves by one per invocation; the callers account for that
 * by permuting the register names from one invocation to the next.  The
 * message block is not mixed in here: each caller follows the macro with a
 * pxor into one of the state registers.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 */
.macro aegis128_update
 movdqa STATE4, T0	/* keep the old STATE4: it is overwritten next */
 aesenc STATE0, STATE4
 aesenc STATE1, STATE0
 aesenc STATE2, STATE1
 aesenc STATE3, STATE2
 aesenc T0, STATE3	/* round key is the STATE4 value saved above */
.endm
64
/*
 * __load_partial: internal ABI
 *
 * Gathers the (LEN & 0xF) bytes that start at SRC + (LEN & 0x10) into the
 * low bytes of MSG and zero-fills the rest of the register.  The bytes are
 * fetched as at most one 8-, one 4-, one 2- and one 1-byte piece, chosen by
 * bits 3..0 of LEN, so nothing outside that byte range is read.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 */
__load_partial:
	pxor MSG, MSG
	xor %r9d, %r9d

	/* bit 0: the single last byte, which sits after every larger piece */
	test $0x1, LEN
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	mov (SRC, %r8), %r9b

.Lld_partial_1:
	/* bit 1: a 2-byte piece; whatever was gathered so far moves above it */
	test $0x2, LEN
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	shl $0x10, %r9
	mov (SRC, %r8), %r9w

.Lld_partial_2:
	/* bit 2: a 4-byte piece */
	test $0x4, LEN
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	shl $32, %r9
	mov (SRC, %r8), %r8d	/* 32-bit load clears bits 63:32 of %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* everything gathered so far forms the low quadword of MSG */
	movq %r9, MSG

	/* bit 3: an 8-byte piece, pushing the rest into the high quadword */
	test $0x8, LEN
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	pslldq $8, MSG
	movq (SRC, %r8), T0
	pxor T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)
130
/*
 * __store_partial: internal ABI
 *
 * Writes the low LEN bytes of T0 to DST as at most one 8-, one 4-, one 2-
 * and one 1-byte store, so nothing beyond DST + LEN is written.
 *
 * LEN originates from an 'unsigned int' C argument, so only its low 32 bits
 * are meaningful: the calling convention leaves bits 63:32 of the register
 * undefined.  The count is therefore zero-extended before use and compared
 * as an unsigned quantity.
 *
 * input:
 *   LEN - bytes (low 32 bits)
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0 (shifted right by 8 bytes when LEN >= 8)
 *   %r8
 *   %r9
 *   %r10
 */
__store_partial:
	mov LEN, %r8
	mov %r8d, %r8d		/* drop the undefined bits 63:32 */
	mov DST, %r9

	movq T0, %r10		/* %r10 = bytes 0..7 of the block */

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0
	movq T0, %r10		/* %r10 = bytes 8..15 of the block */

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
188
/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial state from the 16-byte key and IV, runs ten state
 * updates and writes the resulting five state words to 'state'.
 *
 *   %rdi (STATEP) - state, written with unaligned stores
 *   %rsi          - key, read with movdqa and therefore required to be
 *                   16-byte aligned
 *   %rdx          - iv, read with an unaligned load
 *
 * The constants are addressed relative to %rip so that the code does not
 * depend on absolute relocations.
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1			/* T1 = key ^ iv */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants: */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3		/* STATE3 = key ^ const_0 */
	pxor STATE1, STATE4		/* STATE4 = key ^ const_1 */

	/*
	 * update 10 times with KEY / KEY xor IV; the pxor target steps through
	 * the registers because each update shifts the state positions:
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1, STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1, STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1, STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1, STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1, STATE0

	/* store the state (ten updates bring the registers back in order): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)
233
/*
 * ad_block: absorb one 16-byte block of associated data
 *
 *   a   - 'a' to load the block with movdqa, 'u' to load it with movdqu
 *   s   - state register the block is XORed into after the update
 *   i   - index of the block within the current group of five
 *   out - label to leave through once fewer than 16 bytes remain
 *
 * changed: MSG, T0, LEN, STATE[0-4], flags
 */
.macro ad_block a s i out
	movdq\a (\i * 0x10)(SRC), MSG
	aegis128_update
	pxor MSG, \s

	sub $0x10, LEN
	cmp $0x10, LEN
	jb \out
.endm

/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorbs every complete 16-byte block of 'data' into the state; a trailing
 * partial block is left untouched.  Nothing is read or written when length
 * is below 16.  Blocks are handled five at a time so that the register
 * rotation done by aegis128_update is back in its starting position at the
 * end of each group; the .Lad_out_N exits store the registers in the order
 * that matches the number of blocks done in the final, incomplete group.
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rsi (LEN) are not
	 * defined on entry; clear them before LEN is used as a 64-bit count.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* pick the aligned-load loop only when data is 16-byte aligned */
	test $0xF, SRC
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	ad_block a STATE4 0 .Lad_out_1
	ad_block a STATE3 1 .Lad_out_2
	ad_block a STATE2 2 .Lad_out_3
	ad_block a STATE1 3 .Lad_out_4
	ad_block a STATE0 4 .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	ad_block u STATE4 0 .Lad_out_1
	ad_block u STATE3 1 .Lad_out_2
	ad_block u STATE2 2 .Lad_out_3
	ad_block u STATE1 3 .Lad_out_4
	ad_block u STATE0 4 .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state: */
.Lad_out_0:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)
385
/*
 * encrypt_block: encrypt one complete 16-byte block
 *
 *   a       - 'a' for movdqa or 'u' for movdqu on both the load from SRC
 *             and the store to DST
 *   s0..s4  - registers currently holding state words 0..4 (s0 is part of
 *             the parameter list but is not referenced by the body)
 *   i       - index of the block within the current group of five; selects
 *             the offset from SRC/DST and the .Lenc_out_<i> exit
 *
 * The ciphertext is plaintext ^ s1 ^ s4 ^ (s2 & s3).  The state is then
 * updated and the plaintext XORed into the register that was s4.  Leaves
 * through .Lenc_out_<i> once fewer than 16 bytes remain.
 *
 * changed: MSG, T0, T1, LEN, STATE[0-4], flags
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lenc_out_\i		/* LEN is a byte count: compare unsigned */
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypts every complete 16-byte block of 'src' into 'dst' and updates the
 * state; a trailing partial block is left untouched.  Nothing is read or
 * written when length is below 16.
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rsi (LEN) are not
	 * defined on entry; clear them before LEN is used as a 64-bit count.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both src and dst to be 16-byte aligned */
	mov SRC, %r8
	or DST, %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state: */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)
500
/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypts one final, partial block: the trailing bytes are gathered by
 * __load_partial, encrypted, written back by __store_partial, and the
 * gathered plaintext block is absorbed into the state.
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/* fetch the plaintext bytes; the helper only touches MSG, T0, %r8, %r9 */
	call __load_partial

	/* load the state: */
	movdqu 0x40(STATEP), STATE4
	movdqu 0x30(STATEP), STATE3
	movdqu 0x20(STATEP), STATE2
	movdqu 0x10(STATEP), STATE1
	movdqu 0x00(STATEP), STATE0

	/* T0 = MSG ^ STATE1 ^ STATE4 ^ (STATE2 & STATE3) */
	movdqa STATE3, T1
	pand STATE2, T1
	movdqa STATE1, T0
	pxor STATE4, T0
	pxor T1, T0
	pxor MSG, T0

	/* write out the low LEN bytes of the ciphertext block: */
	call __store_partial

	/* absorb the plaintext block: */
	aegis128_update
	pxor MSG, STATE4

	/* store the state (positions shifted by the single update): */
	movdqu STATE3, 0x40(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE4, 0x00(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)
540
/*
 * decrypt_block: decrypt one complete 16-byte block
 *
 *   a       - 'a' for movdqa or 'u' for movdqu on both the load from SRC
 *             and the store to DST
 *   s0..s4  - registers currently holding state words 0..4 (s0 is part of
 *             the parameter list but is not referenced by the body)
 *   i       - index of the block within the current group of five; selects
 *             the offset from SRC/DST and the .Ldec_out_<i> exit
 *
 * The plaintext is ciphertext ^ s1 ^ s4 ^ (s2 & s3), computed in place in
 * MSG.  The state is then updated and the plaintext XORed into the register
 * that was s4.  Leaves through .Ldec_out_<i> once fewer than 16 bytes
 * remain.
 *
 * changed: MSG, T0, T1, LEN, STATE[0-4], flags
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Ldec_out_\i		/* LEN is a byte count: compare unsigned */
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypts every complete 16-byte block of 'src' into 'dst' and updates the
 * state; a trailing partial block is left untouched.  Nothing is read or
 * written when length is below 16.
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	/*
	 * 'length' is a 32-bit argument, so bits 63:32 of %rsi (LEN) are not
	 * defined on entry; clear them before LEN is used as a 64-bit count.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* the aligned loop needs both src and dst to be 16-byte aligned */
	mov SRC, %r8
	or DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state: */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)
654
/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Decrypts one final, partial block: the trailing ciphertext bytes are
 * gathered by __load_partial, decrypted, written back by __store_partial,
 * and the plaintext - with everything past the byte count cleared - is
 * absorbed into the state.
 *
 * The counter table is addressed relative to %rip so that the code does not
 * depend on absolute relocations.
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count: the zero padding supplied by __load_partial
	 * has been XORed with the keystream, so the bytes past the count must
	 * be cleared again before the block is absorbed.  Four rounds of
	 * punpcklbw copy the lowest byte of LEN into all 16 bytes of T0;
	 * pcmpgtb then yields 0xff in every byte whose index is below it.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (positions shifted by the single update): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)
704
/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Mixes the two lengths (converted to bit counts) into the state with seven
 * updates, then XORs all five state words into the 16 bytes at tag_xor.
 * The state buffer itself is only read.
 *
 *   %rdi (STATEP) - state
 *   %rsi          - tag_xor, read and written with unaligned accesses
 *   %rdx          - assoclen
 *   %rcx          - cryptlen
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* length block: assoclen in the low quadword, cryptlen in the high: */
	movq %rcx, T0
	movq %rdx, MSG
	punpcklqdq T0, MSG
	psllq $3, MSG			/* bytes -> bits in both quadwords */

	/* load the state: */
	movdqu 0x40(STATEP), STATE4
	movdqu 0x30(STATEP), STATE3
	movdqu 0x20(STATEP), STATE2
	movdqu 0x10(STATEP), STATE1
	movdqu 0x00(STATEP), STATE0

	pxor STATE3, MSG

	/* update state seven times, the pxor target following the rotation: */
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3
	aegis128_update
	pxor MSG, STATE2
	aegis128_update
	pxor MSG, STATE1
	aegis128_update
	pxor MSG, STATE0
	aegis128_update
	pxor MSG, STATE4
	aegis128_update
	pxor MSG, STATE3

	/* xor tag: fold every state word into the caller's buffer */
	movdqu (%rsi), T0

	pxor STATE4, T0
	pxor STATE3, T0
	pxor STATE2, T0
	pxor STATE1, T0
	pxor STATE0, T0

	movdqu T0, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)