ASR_BASE
Change-Id: Icf3719cc0afe3eeb3edc7fa80a2eb5199ca9dda1
diff --git a/marvell/linux/arch/x86/crypto/Makefile b/marvell/linux/arch/x86/crypto/Makefile
new file mode 100644
index 0000000..759b1a9
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/Makefile
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+
+obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
+
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+
+# These modules require assembler to support AVX.
+ifeq ($(avx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+ camellia-aesni-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+endif
+
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+ obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+endif
+
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+
+ifeq ($(avx_supported),yes)
+ camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+ camellia_aesni_avx_glue.o
+ cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+ twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+ twofish_avx_glue.o
+ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+ serpent_avx_glue.o
+endif
+
+ifeq ($(avx2_supported),yes)
+ camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+ chacha-x86_64-y += chacha-avx2-x86_64.o
+ serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+
+ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+endif
+
+ifeq ($(avx512_supported),yes)
+ chacha-x86_64-y += chacha-avx512vl-x86_64.o
+endif
+
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
+ifeq ($(avx2_supported),yes)
+sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+poly1305-x86_64-y += poly1305-avx2-x86_64.o
+endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/marvell/linux/arch/x86/crypto/aegis128-aesni-asm.S b/marvell/linux/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 0000000..19aee5a
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,748 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY %xmm5
+#define MSG %xmm5
+#define T0 %xmm6
+#define T1 %xmm7
+
+#define STATEP %rdi
+#define LEN %esi
+#define SRC %rdx
+#define DST %rcx
+
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
+.align 16
+.Laegis128_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * aegis128_update
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state (shifted positions)
+ * changed:
+ * T0
+ */
+.macro aegis128_update
+ movdqa STATE4, T0
+ aesenc STATE0, STATE4
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc T0, STATE3
+.endm
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG, MSG
+
+ mov LEN, %r8d
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8d
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov LEN, %r8d
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8d
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov LEN, %r8d
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8d
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG
+
+ mov LEN, %r8d
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8d
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov LEN, %r8d
+ mov DST, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis128_aesni_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ movdqu (%rdx), T1
+
+ /* load key: */
+ movdqa (%rsi), KEY
+ pxor KEY, T1
+ movdqa T1, STATE0
+ movdqa KEY, STATE3
+ movdqa KEY, STATE4
+
+ /* load the constants: */
+ movdqa .Laegis128_const_0, STATE2
+ movdqa .Laegis128_const_1, STATE1
+ pxor STATE2, STATE3
+ pxor STATE1, STATE4
+
+ /* update 10 times with KEY / KEY xor IV: */
+ aegis128_update; pxor KEY, STATE4
+ aegis128_update; pxor T1, STATE3
+ aegis128_update; pxor KEY, STATE2
+ aegis128_update; pxor T1, STATE1
+ aegis128_update; pxor KEY, STATE0
+ aegis128_update; pxor T1, STATE4
+ aegis128_update; pxor KEY, STATE3
+ aegis128_update; pxor T1, STATE2
+ aegis128_update; pxor KEY, STATE1
+ aegis128_update; pxor T1, STATE0
+
+ /* store the state: */
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_init)
+
+/*
+ * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ */
+ENTRY(crypto_aegis128_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ movdqa 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqa 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqa 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqa 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqa 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ movdqu 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqu 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqu 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqu 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqu 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_u_loop
+
+ /* store the state: */
+.Lad_out_0:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_ad)
+
+.macro encrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ pxor \s1, T0
+ pxor \s4, T0
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, T0
+ movdq\a T0, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lenc_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_u_loop
+
+ /* store the state: */
+.Lenc_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_enc)
+
+/*
+ * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG, T0
+ pxor STATE1, T0
+ pxor STATE4, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ call __store_partial
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_enc_tail)
+
+.macro decrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ pxor \s1, MSG
+ pxor \s4, MSG
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, MSG
+ movdq\a MSG, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_u_loop
+
+ /* store the state: */
+.Ldec_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec)
+
+/*
+ * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE1, MSG
+ pxor STATE4, MSG
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, MSG
+
+ movdqa MSG, T0
+ call __store_partial
+
+ /* mask with byte count: */
+ movd LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Laegis128_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
+ * unsigned int assoclen,
+ * unsigned int cryptlen);
+ */
+ENTRY(crypto_aegis128_aesni_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* prepare length block: */
+ movd %edx, MSG
+ movd %ecx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+ aegis128_update; pxor MSG, STATE2
+ aegis128_update; pxor MSG, STATE1
+ aegis128_update; pxor MSG, STATE0
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_final)
diff --git a/marvell/linux/arch/x86/crypto/aegis128-aesni-glue.c b/marvell/linux/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 0000000..46d2271
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_final(
+ void *state, void *tag_xor, unsigned int cryptlen,
+ unsigned int assoclen);
+
+struct aegis_block {
+ u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+static void crypto_aegis128_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= AEGIS128_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis128_aesni_ad(state,
+ AEGIS128_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ crypto_aegis128_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS128_BLOCK_SIZE - 1);
+ left &= AEGIS128_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
+ crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+static void crypto_aegis128_aesni_process_crypt(
+ struct aegis_state *state, struct skcipher_walk *walk,
+ const struct aegis_crypt_ops *ops)
+{
+ while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+ u8 *ctx = crypto_aead_ctx(aead);
+ ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+ return (void *)ctx;
+}
+
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
+
+ if (keylen != AEGIS128_KEY_SIZE) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+
+ return 0;
+}
+
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ if (authsize > AEGIS128_MAX_AUTH_SIZE)
+ return -EINVAL;
+ if (authsize < AEGIS128_MIN_AUTH_SIZE)
+ return -EINVAL;
+ return 0;
+}
+
+static void crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen,
+ const struct aegis_crypt_ops *ops)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+ struct skcipher_walk walk;
+ struct aegis_state state;
+
+ ops->skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+ crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
+ crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = crypto_aegis128_aesni_enc,
+ .crypt_tail = crypto_aegis128_aesni_enc_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = crypto_aegis128_aesni_dec,
+ .crypt_tail = crypto_aegis128_aesni_dec_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static struct aead_alg crypto_aegis128_aesni_alg = {
+ .setkey = crypto_aegis128_aesni_setkey,
+ .setauthsize = crypto_aegis128_aesni_setauthsize,
+ .encrypt = crypto_aegis128_aesni_encrypt,
+ .decrypt = crypto_aegis128_aesni_decrypt,
+ .init = crypto_aegis128_aesni_init_tfm,
+ .exit = crypto_aegis128_aesni_exit_tfm,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+ .cra_priority = 400,
+
+ .cra_name = "__aegis128",
+ .cra_driver_name = "__aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static struct simd_aead_alg *simd_alg;
+
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
+ &simd_alg);
+}
+
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+ simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg);
+}
+
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/marvell/linux/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/marvell/linux/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 0000000..77043a8
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,569 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptograhpic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define VMOVDQ vmovdqu
+
+#define xdata0 %xmm0
+#define xdata1 %xmm1
+#define xdata2 %xmm2
+#define xdata3 %xmm3
+#define xdata4 %xmm4
+#define xdata5 %xmm5
+#define xdata6 %xmm6
+#define xdata7 %xmm7
+#define xcounter %xmm8
+#define xbyteswap %xmm9
+#define xkey0 %xmm10
+#define xkey4 %xmm11
+#define xkey8 %xmm12
+#define xkey12 %xmm13
+#define xkeyA %xmm14
+#define xkeyB %xmm15
+
+#define p_in %rdi
+#define p_iv %rsi
+#define p_keys %rdx
+#define p_out %rcx
+#define num_bytes %r8
+
+#define tmp %r10
+#define DDQ_DATA 0
+#define XDATA 1
+#define KEY_128 1
+#define KEY_192 2
+#define KEY_256 3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_low_msk:
+ .octa 0x0000000000000000FFFFFFFFFFFFFFFF
+ddq_high_add_1:
+ .octa 0x00000000000000010000000000000000
+ddq_add_1:
+ .octa 0x00000000000000000000000000000001
+ddq_add_2:
+ .octa 0x00000000000000000000000000000002
+ddq_add_3:
+ .octa 0x00000000000000000000000000000003
+ddq_add_4:
+ .octa 0x00000000000000000000000000000004
+ddq_add_5:
+ .octa 0x00000000000000000000000000000005
+ddq_add_6:
+ .octa 0x00000000000000000000000000000006
+ddq_add_7:
+ .octa 0x00000000000000000000000000000007
+ddq_add_8:
+ .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+ var_xdata = %xmm\n
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+ .if \name == XDATA
+ setxdata %\id
+ .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+ .set by, \b
+ .set load_keys, \k
+ .set klen, \key_len
+
+ .if (load_keys)
+ vmovdqa 0*16(p_keys), xkey0
+ .endif
+
+ vpshufb xbyteswap, xcounter, xdata0
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 1*16(p_keys), xkeyA
+
+ vpxor xkey0, xdata0, xdata0
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpxor xkey0, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 2*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 3*16(p_keys), xkey4
+ .endif
+ .else
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
+ .set i, (i +1)
+ .endr
+
+ add $(16*by), p_in
+
+ .if (klen == KEY_128)
+ vmovdqa 4*16(p_keys), xkeyB
+ .else
+ .if (load_keys)
+ vmovdqa 4*16(p_keys), xkey4
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 3 */
+ .if (klen == KEY_128)
+ vaesenc xkey4, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 5*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 4 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey4, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 6*16(p_keys), xkey8
+ .endif
+ .else
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 7*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 6 */
+ .if (klen == KEY_128)
+ vaesenc xkey8, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ vmovdqa 8*16(p_keys), xkeyB
+ .else
+ .if (load_keys)
+ vmovdqa 8*16(p_keys), xkey8
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+ .else
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 8 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey8, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 10*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 9 */
+ .if (klen == KEY_128)
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ vmovdqa 11*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 10 */
+ .if (klen == KEY_128)
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ .if (load_keys)
+ vmovdqa 12*16(p_keys), xkey12
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 13*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ .if (klen == KEY_256)
+ /* key 12 */
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenclast xkey12, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 14*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 13 */
+ vaesenc xkeyA, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 14 */
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
+ .endif
+
+ .set i, 0
+ .rept (by / 2)
+ .set j, (i+1)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ VMOVDQ (j*16 - 16*by)(p_in), xkeyB
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ club XDATA, j
+ vpxor xkeyB, var_xdata, var_xdata
+ .set i, (i+2)
+ .endr
+
+ .if (i < by)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ VMOVDQ var_xdata, i*16(p_out)
+ .set i, (i+1)
+ .endr
+.endm
+
+.macro do_aes_load val, key_len
+ do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+ do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+ cmp $16, num_bytes
+ jb .Ldo_return2\key_len
+
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+
+ mov num_bytes, tmp
+ and $(7*16), tmp
+ jz .Lmult_of_8_blks\key_len
+
+ /* 1 <= tmp <= 7 */
+ cmp $(4*16), tmp
+ jg .Lgt4\key_len
+ je .Leq4\key_len
+
+.Llt4\key_len:
+ cmp $(2*16), tmp
+ jg .Leq3\key_len
+ je .Leq2\key_len
+
+.Leq1\key_len:
+ do_aes_load 1, \key_len
+ add $(1*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq2\key_len:
+ do_aes_load 2, \key_len
+ add $(2*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+ do_aes_load 3, \key_len
+ add $(3*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq4\key_len:
+ do_aes_load 4, \key_len
+ add $(4*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+ cmp $(6*16), tmp
+ jg .Leq7\key_len
+ je .Leq6\key_len
+
+.Leq5\key_len:
+ do_aes_load 5, \key_len
+ add $(5*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq6\key_len:
+ do_aes_load 6, \key_len
+ add $(6*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq7\key_len:
+ do_aes_load 7, \key_len
+ add $(7*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+ .if (\key_len != KEY_128)
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 12*16(p_keys), xkey12
+ .else
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 3*16(p_keys), xkey4
+ vmovdqa 6*16(p_keys), xkey8
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+.align 16
+.Lmain_loop2\key_len:
+ /* num_bytes is a multiple of 8 and >0 */
+ do_aes_noload 8, \key_len
+ add $(8*16), p_out
+ sub $(8*16), num_bytes
+ jne .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/marvell/linux/arch/x86/crypto/aes_glue.c b/marvell/linux/arch/x86/crypto/aes_glue.c
new file mode 100644
index 0000000..7b7dc05
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aes_glue.c
@@ -0,0 +1 @@
+// SPDX-License-Identifier: GPL-2.0-only
diff --git a/marvell/linux/arch/x86/crypto/aesni-intel_asm.S b/marvell/linux/arch/x86/crypto/aesni-intel_asm.S
new file mode 100644
index 0000000..dd954d8
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aesni-intel_asm.S
@@ -0,0 +1,2863 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Adrian Hoban <adrian.hoban@intel.com>
+ * James Guilford (james.guilford@intel.com)
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Wajdi Feghali (wajdi.k.feghali@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
+#ifdef __x86_64__
+
+# constants in mergeable sections, linker can reorder and merge
+.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+.section .rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
+POLY: .octa 0xC2000000000000000000000000000001
+.section .rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
+TWOONE: .octa 0x00000001000000000000000000000001
+
+.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+.section .rodata.cst16.MASK1, "aM", @progbits, 16
+.align 16
+MASK1: .octa 0x0000000000000000ffffffffffffffff
+.section .rodata.cst16.MASK2, "aM", @progbits, 16
+.align 16
+MASK2: .octa 0xffffffffffffffff0000000000000000
+.section .rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE: .octa 0x00000000000000000000000000000001
+.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
+.align 16
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+.section .rodata.cst16.dec, "aM", @progbits, 16
+.align 16
+dec: .octa 0x1
+.section .rodata.cst16.enc, "aM", @progbits, 16
+.align 16
+enc: .octa 0x2
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and zero should follow ALL_F
+.section .rodata, "a", @progbits
+.align 16
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0x00000000000000000000000000000000
+
+.text
+
+
+#define STACK_OFFSET 8*3
+
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+#define HashKey 16*6 // store HashKey <<1 mod poly here
+#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
+#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
+#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
+#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
+ // bits of HashKey <<1 mod poly here
+ //(for Karatsuba purposes)
+#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^2 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^3 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^4 <<1 mod poly here
+ // (for Karatsuba purposes)
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%rsp)
+#define arg8 STACK_OFFSET+16(%rsp)
+#define arg9 STACK_OFFSET+24(%rsp)
+#define arg10 STACK_OFFSET+32(%rsp)
+#define arg11 STACK_OFFSET+40(%rsp)
+#define keysize 2*15*16(%arg1)
+#endif
+
+
+#define STATE1 %xmm0
+#define STATE2 %xmm4
+#define STATE3 %xmm5
+#define STATE4 %xmm6
+#define STATE STATE1
+#define IN1 %xmm1
+#define IN2 %xmm7
+#define IN3 %xmm8
+#define IN4 %xmm9
+#define IN IN1
+#define KEY %xmm2
+#define IV %xmm3
+
+#define BSWAP_MASK %xmm10
+#define CTR %xmm11
+#define INC %xmm12
+
+#define GF128MUL_MASK %xmm10
+
+#ifdef __x86_64__
+#define AREG %rax
+#define KEYP %rdi
+#define OUTP %rsi
+#define UKEYP OUTP
+#define INP %rdx
+#define LEN %rcx
+#define IVP %r8
+#define KLEN %r9d
+#define T1 %r10
+#define TKEYP T1
+#define T2 %r11
+#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif
+
+.macro FUNC_SAVE
+ push %r12
+ push %r13
+ push %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+.endm
+
+
+.macro FUNC_RESTORE
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+# Precompute hashkeys.
+# Input: Hash subkey.
+# Output: HashKeys stored in gcm_context_data. Only needs to be called
+# once per key.
+# clobbers r12, and tmp xmm registers.
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+ mov \SUBKEY, %r12
+ movdqu (%r12), \TMP3
+ movdqa SHUF_MASK(%rip), \TMP2
+ PSHUFB_XMM \TMP2, \TMP3
+
+ # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+ movdqa \TMP3, \TMP2
+ psllq $1, \TMP3
+ psrlq $63, \TMP2
+ movdqa \TMP2, \TMP1
+ pslldq $8, \TMP2
+ psrldq $8, \TMP1
+ por \TMP2, \TMP3
+
+ # reduce HashKey<<1
+
+ pshufd $0x24, \TMP1, \TMP2
+ pcmpeqd TWOONE(%rip), \TMP2
+ pand POLY(%rip), \TMP2
+ pxor \TMP2, \TMP3
+ movdqu \TMP3, HashKey(%arg2)
+
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqu \TMP1, HashKey_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqu \TMP5, HashKey_2(%arg2)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_2_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqu \TMP5, HashKey_3(%arg2)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_3_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqu \TMP5, HashKey_4(%arg2)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_4_k(%arg2)
+.endm
+
+# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
+.macro GCM_INIT Iv SUBKEY AAD AADLEN
+ mov \AADLEN, %r11
+ mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
+ xor %r11d, %r11d
+ mov %r11, InLen(%arg2) # ctx_data.in_length = 0
+ mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
+ mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
+ mov \Iv, %rax
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
+
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm0
+ movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
+
+ PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
+ movdqu HashKey(%arg2), %xmm13
+
+ CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
+ %xmm4, %xmm5, %xmm6
+.endm
+
+# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
+# struct has been initialized by GCM_INIT.
+# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
+# Clobbers rax, r10-r13, and xmm0-xmm15
+.macro GCM_ENC_DEC operation
+ movdqu AadHash(%arg2), %xmm8
+ movdqu HashKey(%arg2), %xmm13
+ add %arg5, InLen(%arg2)
+
+ xor %r11d, %r11d # initialise the data pointer offset as zero
+ PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
+
+ sub %r11, %arg5 # sub partial block data used
+ mov %arg5, %r13 # save the number of bytes
+
+ and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
+ mov %r13, %r12
+ # Encrypt/Decrypt first few blocks
+
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_\@
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_\@
+ je _initial_num_blocks_is_2_\@
+_initial_num_blocks_is_3_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
+ sub $48, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_2_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
+ sub $32, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_1_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
+ sub $16, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_0_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
+_initial_blocks_\@:
+
+ # Main loop - Encrypt/Decrypt remaining blocks
+
+ test %r13, %r13
+ je _zero_cipher_left_\@
+ sub $64, %r13
+ je _four_cipher_left_\@
+_crypt_by_4_\@:
+ GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
+ %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
+ %xmm7, %xmm8, enc
+ add $64, %r11
+ sub $64, %r13
+ jne _crypt_by_4_\@
+_four_cipher_left_\@:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_\@:
+ movdqu %xmm8, AadHash(%arg2)
+ movdqu %xmm0, CurCount(%arg2)
+
+ mov %arg5, %r13
+ and $15, %r13 # %r13 = arg5 (mod 16)
+ je _multiple_of_16_bytes_\@
+
+ mov %r13, PBlockLen(%arg2)
+
+ # Handle the last <16 Byte block separately
+ paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
+ movdqu %xmm0, CurCount(%arg2)
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
+ movdqu %xmm0, PBlockEncKey(%arg2)
+
+ cmp $16, %arg5
+ jge _large_enough_update_\@
+
+ lea (%arg4,%r11,1), %r10
+ mov %r13, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+ jmp _data_read_\@
+
+_large_enough_update_\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ movdqu (%arg4, %r11, 1), %xmm1
+
+ sub %r13, %r11
+ add $16, %r11
+
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ movdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ PSHUFB_XMM %xmm2, %xmm1
+
+_data_read_\@:
+ lea ALL_F+16(%rip), %r12
+ sub %r13, %r12
+
+.ifc \operation, dec
+ movdqa %xmm1, %xmm2
+.endif
+ pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
+ movdqu (%r12), %xmm1
+ # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+ pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
+.ifc \operation, dec
+ pand %xmm1, %xmm2
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10 ,%xmm2
+
+ pxor %xmm2, %xmm8
+.else
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10,%xmm0
+
+ pxor %xmm0, %xmm8
+.endif
+
+ movdqu %xmm8, AadHash(%arg2)
+.ifc \operation, enc
+ # GHASH computation for the last <16 byte block
+ movdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm0 back to output as ciphertext
+ PSHUFB_XMM %xmm10, %xmm0
+.endif
+
+ # Output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+ mov %rax, (%arg3 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ mov %al, (%arg3, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_multiple_of_16_bytes_\@:
+.endm
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
+ movdqu AadHash(%arg2), %xmm8
+ movdqu HashKey(%arg2), %xmm13
+
+ mov PBlockLen(%arg2), %r12
+
+ test %r12, %r12
+ je _partial_done\@
+
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+ mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ mov InLen(%arg2), %r12
+ shl $3, %r12 # len(C) in bits (*128)
+ MOVQ_R64_XMM %r12, %xmm1
+
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8
+
+ movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_\@:
+ mov \AUTHTAG, %r10 # %r10 = authTag
+ mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_\@
+ cmp $8, %r11
+ jl _T_4_\@
+_T_8_\@:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ psrldq $8, %xmm0
+ test %r11, %r11
+ je _return_T_done_\@
+_T_4_\@:
+ movd %xmm0, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ psrldq $4, %xmm0
+ test %r11, %r11
+ je _return_T_done_\@
+_T_123_\@:
+ movd %xmm0, %eax
+ cmp $2, %r11
+ jl _T_1_\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done_\@
+ add $2, %r10
+ sar $16, %eax
+_T_1_\@:
+ mov %al, (%r10)
+ jmp _return_T_done_\@
+_T_16_\@:
+ movdqu %xmm0, (%r10)
+_return_T_done_\@:
+.endm
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+ movdqa \GH, \TMP1
+ pshufd $78, \GH, \TMP2
+ pshufd $78, \HK, \TMP3
+ pxor \GH, \TMP2 # TMP2 = a1+a0
+ pxor \HK, \TMP3 # TMP3 = b1+b0
+ PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
+ PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pxor \GH, \TMP2
+ pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \GH
+ pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
+
+ # first phase of the reduction
+
+ movdqa \GH, \TMP2
+ movdqa \GH, \TMP3
+ movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ pslld $31, \TMP2 # packed right shift <<31
+ pslld $30, \TMP3 # packed right shift <<30
+ pslld $25, \TMP4 # packed right shift <<25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift TMP5 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \GH
+
+ # second phase of the reduction
+
+ movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ movdqa \GH,\TMP3
+ movdqa \GH,\TMP4
+ psrld $1,\TMP2 # packed left shift >>1
+ psrld $2,\TMP3 # packed left shift >>2
+ psrld $7,\TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \GH
+ pxor \TMP1, \GH # result is in TMP1
+.endm
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN and XMM1
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ MOVQ_R64_XMM %rax, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ MOVQ_R64_XMM %rax, \XMM1
+ pslldq $8, \XMM1
+ por \XMM1, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ MOVQ_R64_XMM %rax, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
+# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+# clobbers r10-11, xmm14
+.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
+ TMP6 TMP7
+ MOVADQ SHUF_MASK(%rip), %xmm14
+ mov \AAD, %r10 # %r10 = AAD
+ mov \AADLEN, %r11 # %r11 = aadLen
+ pxor \TMP7, \TMP7
+ pxor \TMP6, \TMP6
+
+ cmp $16, %r11
+ jl _get_AAD_rest\@
+_get_AAD_blocks\@:
+ movdqu (%r10), \TMP7
+ PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pxor \TMP7, \TMP6
+ GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+ add $16, %r10
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+
+ movdqu \TMP6, \TMP7
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\@:
+ test %r11, %r11
+ je _get_AAD_done\@
+
+ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+ PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pxor \TMP6, \TMP7
+ GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+ movdqu \TMP7, \TMP6
+
+_get_AAD_done\@:
+ movdqu \TMP6, AadHash(%arg2)
+.endm
+
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH operation
+ mov PBlockLen(%arg2), %r13
+ test %r13, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+ jl _fewer_than_16_bytes_\@
+ movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+ jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+ mov \PLAIN_CYPH_LEN, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
+
+ mov PBlockLen(%arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+ movdqu PBlockEncKey(%arg2), %xmm9
+ movdqu HashKey(%arg2), %xmm13
+
+ lea SHIFT_MASK(%rip), %r12
+
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
+ # r16-r13 is the number of bytes in plaintext mod 16)
+ add %r13, %r12
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
+
+.ifc \operation, dec
+ movdqa %xmm1, %xmm3
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_1_\@
+ sub %r10, %r12
+_no_extra_mask_1_\@:
+
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
+
+ pand %xmm1, %xmm3
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm3
+ PSHUFB_XMM %xmm2, %xmm3
+ pxor %xmm3, \AAD_HASH
+
+ test %r10, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+ GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax, %eax
+
+ mov %rax, PBlockLen(%arg2)
+ jmp _dec_done_\@
+_partial_incomplete_1_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_dec_done_\@:
+ movdqu \AAD_HASH, AadHash(%arg2)
+.else
+ pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_2_\@
+ sub %r10, %r12
+_no_extra_mask_2_\@:
+
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand %xmm1, %xmm9
+
+ movdqa SHUF_MASK(%rip), %xmm1
+ PSHUFB_XMM %xmm1, %xmm9
+ PSHUFB_XMM %xmm2, %xmm9
+ pxor %xmm9, \AAD_HASH
+
+ test %r10, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+ GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax, %eax
+
+ mov %rax, PBlockLen(%arg2)
+ jmp _encode_done_\@
+_partial_incomplete_2_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_encode_done_\@:
+ movdqu \AAD_HASH, AadHash(%arg2)
+
+ movdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm9 back to output as ciphertext
+ PSHUFB_XMM %xmm10, %xmm9
+ PSHUFB_XMM %xmm2, %xmm9
+.endif
+ # output encrypted Bytes
+ test %r10, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+ # Set r13 to be the number of bytes to write out
+ sub %r12, %r13
+ jmp _count_set_\@
+_partial_fill_\@:
+ mov \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+ movdqa %xmm9, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $8, \DATA_OFFSET
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $1, \DATA_OFFSET
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
+
+ movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
+
+ # start AES for num_initial_blocks blocks
+
+ movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ 0(%arg1),\TMP2
+.irpc index, \i_seq
+ paddd \TMP1, \XMM0 # INCR Y0
+.ifc \operation, dec
+ movdqa \XMM0, %xmm\index
+.else
+ MOVADQ \XMM0, %xmm\index
+.endif
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pxor \TMP2, %xmm\index
+.endr
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_\@:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_\@
+
+ MOVADQ (%r10), \TMP1
+.irpc index, \i_seq
+ AESENCLAST \TMP1, %xmm\index # Last Round
+.endr
+.irpc index, \i_seq
+ movdqu (%arg4 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg3 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+.ifc \operation, dec
+ movdqa \TMP1, %xmm\index
+.endif
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\@
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+ MOVADQ ONE(%RIP),\TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_done\@
+
+aes_loop_pre_\@:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_\@
+
+aes_loop_pre_done\@:
+ MOVADQ (%r10), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+.ifc \operation, dec
+ movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM1
+.endif
+ movdqu 16*1(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+.ifc \operation, dec
+ movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM2
+.endif
+ movdqu 16*2(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+.ifc \operation, dec
+ movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM3
+.endif
+ movdqu 16*3(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+.ifc \operation, dec
+ movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM4
+.else
+ movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
+ movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
+ movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
+ movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
+.endif
+
+ add $64, %r11
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\@:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg3, %arg4 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqu HashKey_4_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_3_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_2_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_enc_done\@
+
+aes_loop_par_enc\@:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_enc\@
+
+aes_loop_par_enc_done\@:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqu HashKey_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu 16(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu 32(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu 48(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg3, %arg4 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqu HashKey_4_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_3_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_2_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_dec_done\@
+
+aes_loop_par_dec\@:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_dec\@
+
+aes_loop_par_dec_done\@:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # last round
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqu HashKey_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM1
+ movdqu 16(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM2
+ movdqu 32(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM3
+ movdqu 48(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+ # Multiply TMP6 * HashKey (using Karatsuba)
+
+ movdqa \XMM1, \TMP6
+ pshufd $78, \XMM1, \TMP2
+ pxor \XMM1, \TMP2
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ movdqu HashKey_4_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqa \XMM1, \XMMDst
+ movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM2, \TMP1
+ pshufd $78, \XMM2, \TMP2
+ pxor \XMM2, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ movdqu HashKey_3_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM2, \XMMDst
+ pxor \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM3, \TMP1
+ pshufd $78, \XMM3, \TMP2
+ pxor \XMM3, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ movdqu HashKey_2_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM3, \XMMDst
+ pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+ movdqa \XMM4, \TMP1
+ pshufd $78, \XMM4, \TMP2
+ pxor \XMM4, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ movdqu HashKey_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM4, \XMMDst
+ pxor \XMM1, \TMP2
+ pxor \TMP6, \TMP2
+ pxor \XMMDst, \TMP2
+ # middle section of the temp results combined as in karatsuba algorithm
+ movdqa \TMP2, \TMP4
+ pslldq $8, \TMP4 # left shift TMP4 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP4, \XMMDst
+ pxor \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+ # first phase of the reduction
+ movdqa \XMMDst, \TMP2
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+ pslld $31, \TMP2 # packed right shifting << 31
+ pslld $30, \TMP3 # packed right shifting << 30
+ pslld $25, \TMP4 # packed right shifting << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP7
+ psrldq $4, \TMP7 # right shift TMP7 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \XMMDst
+
+ # second phase of the reduction
+ movdqa \XMMDst, \TMP2
+ # make 3 copies of XMMDst for doing 3 shift operations
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+ psrld $1, \TMP2 # packed left shift >> 1
+ psrld $2, \TMP3 # packed left shift >> 2
+ psrld $7, \TMP4 # packed left shift >> 7
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ pxor \TMP7, \TMP2
+ pxor \TMP2, \XMMDst
+ pxor \TMP6, \XMMDst # reduced result is in XMMDst
+.endm
+
+
+/* Encryption of a single block
+* uses eax & r10
+*/
+
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+ pxor (%arg1), \XMM0
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+ lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+ MOVADQ (%r10),\TMP1
+ AESENC \TMP1,\XMM0
+ add $16,%r10
+ sub $1,%eax
+ jnz _esb_loop_\@
+
+ MOVADQ (%r10),\TMP1
+ AESENCLAST \TMP1,\XMM0
+.endm
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data
+* // Context data
+* u8 *out, // Plaintext output. Encrypt in-place is allowed.
+* const u8 *in, // Ciphertext input
+* u64 plaintext_len, // Length of data in bytes for decryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
+* // given authentication tag and only return the plaintext if they match.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+* // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the first
+* set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+ENTRY(aesni_gcm_dec)
+ FUNC_SAVE
+
+ GCM_INIT %arg6, arg7, arg8, arg9
+ GCM_ENC_DEC dec
+ GCM_COMPLETE arg10, arg11
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec)
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data
+* // Context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the
+* first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+ FUNC_SAVE
+
+ GCM_INIT %arg6, arg7, arg8, arg9
+ GCM_ENC_DEC enc
+
+ GCM_COMPLETE arg10, arg11
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc)
+
+/*****************************************************************************
+* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len) // Length of AAD in bytes.
+*/
+ENTRY(aesni_gcm_init)
+ FUNC_SAVE
+ GCM_INIT %arg3, %arg4,%arg5, %arg6
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_init)
+
+/*****************************************************************************
+* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+*/
+ENTRY(aesni_gcm_enc_update)
+ FUNC_SAVE
+ GCM_ENC_DEC enc
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update)
+
+/*****************************************************************************
+* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+*/
+ENTRY(aesni_gcm_dec_update)
+ FUNC_SAVE
+ GCM_ENC_DEC dec
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update)
+
+/*****************************************************************************
+* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*/
+ENTRY(aesni_gcm_finalize)
+ FUNC_SAVE
+ GCM_COMPLETE %arg3 %arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize)
+
+#endif
+
+
+.align 4
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_128)
+ENDPROC(_key_expansion_256a)
+
+.align 4
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (TKEYP)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
+ ret
+ENDPROC(_key_expansion_192a)
+
+.align 4
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_192b)
+
+.align 4
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_256b)
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+ movl (FRAME_OFFSET+16)(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
+ pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
+ cmp $24, %dl
+ jb .Lenc_key128
+ je .Lenc_key192
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_256a
+ jmp .Ldec_key
+.Lenc_key192:
+ movq 0x10(UKEYP), %xmm2 # other user key
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ call _key_expansion_192b
+ jmp .Ldec_key
+.Lenc_key128:
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ call _key_expansion_128
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ call _key_expansion_128
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ call _key_expansion_128
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ call _key_expansion_128
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ call _key_expansion_128
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ call _key_expansion_128
+ AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ call _key_expansion_128
+ AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ call _key_expansion_128
+ AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ call _key_expansion_128
+ AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ call _key_expansion_128
+.Ldec_key:
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
+.align 4
+.Ldec_key_loop:
+ movaps (KEYP), %xmm0
+ AESIMC %xmm0 %xmm1
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
+ jb .Ldec_key_loop
+ xor AREG, AREG
+#ifndef __x86_64__
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_set_key)
+
+/*
+ * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
+ movl 480(KEYP), KLEN # key length
+ movups (INP), STATE # input
+ call _aesni_enc1
+ movups STATE, (OUTP) # output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_enc)
+
+/*
+ * _aesni_enc1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_enc1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Lenc128
+ lea 0x20(TKEYP), TKEYP
+ je .Lenc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps (TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE
+ ret
+ENDPROC(_aesni_enc1)
+
+/*
+ * _aesni_enc4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_enc4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4enc128
+ lea 0x20(TKEYP), TKEYP
+ je .L4enc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps (TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE1 # last round
+ AESENCLAST KEY STATE2
+ AESENCLAST KEY STATE3
+ AESENCLAST KEY STATE4
+ ret
+ENDPROC(_aesni_enc4)
+
+/*
+ * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
+ mov 480(KEYP), KLEN # key length
+ add $240, KEYP
+ movups (INP), STATE # input
+ call _aesni_dec1
+ movups STATE, (OUTP) #output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_dec)
+
+/*
+ * _aesni_dec1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_dec1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Ldec128
+ lea 0x20(TKEYP), TKEYP
+ je .Ldec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE
+ ret
+ENDPROC(_aesni_dec1)
+
+/*
+ * _aesni_dec4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_dec4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4dec128
+ lea 0x20(TKEYP), TKEYP
+ je .L4dec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE1 # last round
+ AESDECLAST KEY STATE2
+ AESDECLAST KEY STATE3
+ AESDECLAST KEY STATE4
+ ret
+ENDPROC(_aesni_dec4)
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
+ test LEN, LEN # check length
+ jz .Lecb_enc_ret
+ mov 480(KEYP), KLEN
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+ cmp $64, LEN
+ jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_enc4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_enc_loop4
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+ movups (INP), STATE1
+ call _aesni_enc1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_ecb_enc)
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
+ test LEN, LEN
+ jz .Lecb_dec_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+ cmp $64, LEN
+ jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_dec4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_dec_loop4
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+ movups (INP), STATE1
+ call _aesni_dec1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_ecb_dec)
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
+ cmp $16, LEN
+ jb .Lcbc_enc_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE # load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+ movups (INP), IN # load input
+ pxor IN, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP) # store output
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_enc_loop
+ movups STATE, (IVP)
+.Lcbc_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_cbc_enc)
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
+ cmp $16, LEN
+ jb .Lcbc_dec_just_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ cmp $64, LEN
+ jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+ movups (INP), IN1
+ movaps IN1, STATE1
+ movups 0x10(INP), IN2
+ movaps IN2, STATE2
+#ifdef __x86_64__
+ movups 0x20(INP), IN3
+ movaps IN3, STATE3
+ movups 0x30(INP), IN4
+ movaps IN4, STATE4
+#else
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+#endif
+ call _aesni_dec4
+ pxor IV, STATE1
+#ifdef __x86_64__
+ pxor IN1, STATE2
+ pxor IN2, STATE3
+ pxor IN3, STATE4
+ movaps IN4, IV
+#else
+ pxor IN1, STATE4
+ movaps IN2, IV
+ movups (INP), IN1
+ pxor IN1, STATE2
+ movups 0x10(INP), IN2
+ pxor IN2, STATE3
+#endif
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lcbc_dec_loop4
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+ movups (INP), IN
+ movaps IN, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ movups STATE, (OUTP)
+ movaps IN, IV
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+ movups IV, (IVP)
+.Lcbc_dec_just_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_cbc_dec)
+
+#ifdef __x86_64__
+.pushsection .rodata
+.align 16
+.Lbswap_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.popsection
+
+/*
+ * _aesni_inc_init: internal ABI
+ * setup registers used by _aesni_inc
+ * input:
+ * IV
+ * output:
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ */
+.align 4
+_aesni_inc_init:
+ movaps .Lbswap_mask, BSWAP_MASK
+ movaps IV, CTR
+ PSHUFB_XMM BSWAP_MASK CTR
+ mov $1, TCTR_LOW
+ MOVQ_R64_XMM TCTR_LOW INC
+ MOVQ_R64_XMM CTR TCTR_LOW
+ ret
+ENDPROC(_aesni_inc_init)
+
+/*
+ * _aesni_inc: internal ABI
+ * Increase IV by 1, IV is in big endian
+ * input:
+ * IV
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ * output:
+ * IV: Increase by 1
+ * changed:
+ * CTR: == output IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ */
+.align 4
+_aesni_inc:
+ paddq INC, CTR
+ add $1, TCTR_LOW
+ jnc .Linc_low
+ pslldq $8, INC
+ paddq INC, CTR
+ psrldq $8, INC
+.Linc_low:
+ movaps CTR, IV
+ PSHUFB_XMM BSWAP_MASK IV
+ ret
+ENDPROC(_aesni_inc)
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+ FRAME_BEGIN
+ cmp $16, LEN
+ jb .Lctr_enc_just_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), IV
+ call _aesni_inc_init
+ cmp $64, LEN
+ jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+ movaps IV, STATE1
+ call _aesni_inc
+ movups (INP), IN1
+ movaps IV, STATE2
+ call _aesni_inc
+ movups 0x10(INP), IN2
+ movaps IV, STATE3
+ call _aesni_inc
+ movups 0x20(INP), IN3
+ movaps IV, STATE4
+ call _aesni_inc
+ movups 0x30(INP), IN4
+ call _aesni_enc4
+ pxor IN1, STATE1
+ movups STATE1, (OUTP)
+ pxor IN2, STATE2
+ movups STATE2, 0x10(OUTP)
+ pxor IN3, STATE3
+ movups STATE3, 0x20(OUTP)
+ pxor IN4, STATE4
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lctr_enc_loop4
+ cmp $16, LEN
+ jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+ movaps IV, STATE
+ call _aesni_inc
+ movups (INP), IN
+ call _aesni_enc1
+ pxor IN, STATE
+ movups STATE, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+ movups IV, (IVP)
+.Lctr_enc_just_ret:
+ FRAME_END
+ ret
+ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble: internal ABI
+ * Multiply in GF(2^128) for XTS IVs
+ * input:
+ * IV: current IV
+ * GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ * IV: next IV
+ * changed:
+ * CTR: == temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+ pshufd $0x13, IV, CTR; \
+ paddq IV, IV; \
+ psrad $31, CTR; \
+ pand GF128MUL_MASK, CTR; \
+ pxor CTR, IV;
+
+/*
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+ENTRY(aesni_xts_encrypt)
+ FRAME_BEGIN
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+
+.Lxts_enc_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+ movdqu IV, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x10(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x30(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call _aesni_enc4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_enc_loop4
+
+ movups IV, (IVP)
+
+ FRAME_END
+ ret
+ENDPROC(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+ENTRY(aesni_xts_decrypt)
+ FRAME_BEGIN
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+
+.Lxts_dec_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+ movdqu IV, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x10(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x30(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call _aesni_dec4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_dec_loop4
+
+ movups IV, (IVP)
+
+ FRAME_END
+ ret
+ENDPROC(aesni_xts_decrypt)
+
+#endif
diff --git a/marvell/linux/arch/x86/crypto/aesni-intel_avx-x86_64.S b/marvell/linux/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 0000000..4e4d349
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2843 @@
+########################################################################
+# Copyright (c) 2013, Intel Corporation
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
+# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+##
+## Authors:
+## Erdinc Ozturk <erdinc.ozturk@intel.com>
+## Vinodh Gopal <vinodh.gopal@intel.com>
+## James Guilford <james.guilford@intel.com>
+## Tim Chen <tim.c.chen@linux.intel.com>
+##
+## References:
+## This code was derived and highly optimized from the code described in paper:
+## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
+## on Intel Architecture Processors. August, 2010
+## The details of the implementation is explained in:
+## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
+## on Intel Architecture Processors. October, 2012.
+##
+## Assumptions:
+##
+##
+##
+## iv:
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Salt (From the SA) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Initialization Vector |
+## | (This is the sequence number from IPSec header) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x1 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##
+##
+## AAD:
+## AAD padded to 128 bits with 0
+## for example, assume AAD is a u32 vector
+##
+## if AAD is 8 bytes:
+## AAD[3] = {A0, A1}#
+## padded AAD in xmm register = {A1 A0 0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A1) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 32-bit Sequence Number (A0) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 32-bit Sequence Number
+##
+## if AAD is 12 bytes:
+## AAD[3] = {A0, A1, A2}#
+## padded AAD in xmm register = {A2 A1 A0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A2) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 64-bit Extended Sequence Number {A1,A0} |
+## | |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 64-bit Extended Sequence Number
+##
+##
+## aadLen:
+## from the definition of the spec, aadLen can only be 8 or 12 bytes.
+## The code additionally supports aadLen of length 16 bytes.
+##
+## TLen:
+## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+##
+## poly = x^128 + x^127 + x^126 + x^121 + 1
+## throughout the code, one tab and two tab indentations are used. one tab is
+## for GHASH part, two tabs is for AES part.
+##
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+# constants in mergeable sections, linker can reorder and merge
+.section .rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
+POLY: .octa 0xC2000000000000000000000000000001
+
+.section .rodata.cst16.POLY2, "aM", @progbits, 16
+.align 16
+POLY2: .octa 0xC20000000000000000000001C2000000
+
+.section .rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
+TWOONE: .octa 0x00000001000000000000000000000001
+
+.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+
+.section .rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE: .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst16.ONEf, "aM", @progbits, 16
+.align 16
+ONEf: .octa 0x01000000000000000000000000000000
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
+.section .rodata, "a", @progbits
+.align 16
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0x00000000000000000000000000000000
+
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0xffffffffffffffffffffffffffffff0C
+ .octa 0xffffffffffffffffffffffffffff0D0C
+ .octa 0xffffffffffffffffffffffffff0E0D0C
+ .octa 0xffffffffffffffffffffffff0F0E0D0C
+ .octa 0xffffffffffffffffffffff0C0B0A0908
+ .octa 0xffffffffffffffffffff0D0C0B0A0908
+ .octa 0xffffffffffffffffff0E0D0C0B0A0908
+ .octa 0xffffffffffffffff0F0E0D0C0B0A0908
+ .octa 0xffffffffffffff0C0B0A090807060504
+ .octa 0xffffffffffff0D0C0B0A090807060504
+ .octa 0xffffffffff0E0D0C0B0A090807060504
+ .octa 0xffffffff0F0E0D0C0B0A090807060504
+ .octa 0xffffff0C0B0A09080706050403020100
+ .octa 0xffff0D0C0B0A09080706050403020100
+ .octa 0xff0E0D0C0B0A09080706050403020100
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+
+.text
+
+
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
+HashKey = 16*6 # store HashKey <<1 mod poly here
+HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+#define arg4 %rcx
+#define arg5 %r8
+#define arg6 %r9
+#define arg7 STACK_OFFSET+8*1(%r14)
+#define arg8 STACK_OFFSET+8*2(%r14)
+#define arg9 STACK_OFFSET+8*3(%r14)
+#define arg10 STACK_OFFSET+8*4(%r14)
+#define keysize 2*15*16(arg1)
+
+i = 0
+j = 0
+
+out_order = 0
+in_order = 1
+DEC = 0
+ENC = 1
+
+.macro define_reg r n
+reg_\r = %xmm\n
+.endm
+
+.macro setreg
+.altmacro
+define_reg i %i
+define_reg j %j
+.noaltmacro
+.endm
+
+# need to push 4 registers into stack to maintain
+STACK_OFFSET = 8*4
+
+TMP1 = 16*0 # Temporary storage for AAD
+TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+TMP3 = 16*2 # Temporary storage for AES State 3
+TMP4 = 16*3 # Temporary storage for AES State 4
+TMP5 = 16*4 # Temporary storage for AES State 5
+TMP6 = 16*5 # Temporary storage for AES State 6
+TMP7 = 16*6 # Temporary storage for AES State 7
+TMP8 = 16*7 # Temporary storage for AES State 8
+
+VARIABLE_OFFSET = 16*8
+
+################################
+# Utility Macros
+################################
+
+.macro FUNC_SAVE
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+.endm
+
+.macro FUNC_RESTORE
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
+ vpxor (arg1), \XMM0, \XMM0
+ i = 1
+ setreg
+.rep \REP
+ vaesenc 16*i(arg1), \XMM0, \XMM0
+ i = (i+1)
+ setreg
+.endr
+ vaesenclast 16*i(arg1), \XMM0, \XMM0
+.endm
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
+ vmovdqu AadHash(arg2), %xmm8
+ vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
+ add arg5, InLen(arg2)
+
+ # initialize the data pointer offset as zero
+ xor %r11d, %r11d
+
+ PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
+ sub %r11, arg5
+
+ mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ test %r13, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ vmovdqu %xmm14, AadHash(arg2)
+ vmovdqu %xmm9, CurCount(arg2)
+
+ # check for 0 length
+ mov arg5, %r13
+ and $15, %r13 # r13 = (arg5 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block separately
+
+ mov %r13, PBlockLen(arg2)
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vmovdqu %xmm9, CurCount(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
+ vmovdqu %xmm9, PBlockEncKey(arg2)
+
+ cmp $16, arg5
+ jge _large_enough_update\@
+
+ lea (arg4,%r11,1), %r10
+ mov %r13, %r12
+
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+ jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ vmovdqu (arg4, %r11, 1), %xmm1
+
+ sub %r13, %r11
+ add $16, %r11
+
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ vmovdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ vpshufb %xmm2, %xmm1, %xmm1
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg3 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg3 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+.endm
+
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
+ vmovdqu AadHash(arg2), %xmm14
+ vmovdqu HashKey(arg2), %xmm13
+
+ mov PBlockLen(arg2), %r12
+ test %r12, %r12
+ je _partial_done\@
+
+ #GHASH computation for the last <16 Byte block
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+ mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+ mov InLen(arg2), %r12
+ shl $3, %r12 # len(C) in bits (*128)
+ vmovq %r12, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ vmovdqu OrigIV(arg2), %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov \AUTH_TAG, %r10 # r10 = authTag
+ mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $8, %r11
+ jl _T_4\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ test %r11, %r11
+ je _return_T_done\@
+_T_4\@:
+ vmovd %xmm9, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ test %r11, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+.endm
+
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
+
+ mov \AAD, %r10 # r10 = AAD
+ mov \AADLEN, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor \T8, \T8, \T8
+ vpxor \T7, \T7, \T7
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), \T7
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T7, \T8, \T8
+ \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu \T8, \T7
+ test %r11, %r11
+ je _get_AAD_done\@
+
+ vpxor \T7, \T7, \T7
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, \T7, \T7
+ vpxor \T1, \T7, \T7
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ test %r11, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, \T7, \T7
+ vpxor \T1, \T7, \T7
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ vmovdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T8, \T7, \T7
+ \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ vmovdqu \T7, AadHash(arg2)
+.endm
+
+.macro INIT GHASH_MUL PRECOMPUTE
+ mov arg6, %r11
+ mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
+ xor %r11d, %r11d
+ mov %r11, InLen(arg2) # ctx_data.in_length = 0
+
+ mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
+ mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
+ mov arg3, %rax
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
+
+ vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
+ movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
+
+ vmovdqu (arg4), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
+
+ CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
+ \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+.endm
+
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+ vpxor \XMMDst, \XMMDst, \XMMDst
+
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ vpinsrq $1, %rax, \XMMDst, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH ENC_DEC
+ mov PBlockLen(arg2), %r13
+ test %r13, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+ jl _fewer_than_16_bytes_\@
+ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+ jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+ mov \PLAIN_CYPH_LEN, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ mov PBlockLen(arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+ vmovdqu PBlockEncKey(arg2), %xmm9
+ vmovdqu HashKey(arg2), %xmm13
+
+ lea SHIFT_MASK(%rip), %r12
+
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
+ # r16-r13 is the number of bytes in plaintext mod 16)
+ add %r13, %r12
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
+
+.if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm3
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_1_\@
+ sub %r10, %r12
+_no_extra_mask_1_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
+
+ vpand %xmm1, %xmm3, %xmm3
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ vpshufb %xmm10, %xmm3, %xmm3
+ vpshufb %xmm2, %xmm3, %xmm3
+ vpxor %xmm3, \AAD_HASH, \AAD_HASH
+
+ test %r10, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _dec_done_\@
+_partial_incomplete_1_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_dec_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+.else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_2_\@
+ sub %r10, %r12
+_no_extra_mask_2_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9
+
+ vmovdqa SHUF_MASK(%rip), %xmm1
+ vpshufb %xmm1, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+ vpxor %xmm9, \AAD_HASH, \AAD_HASH
+
+ test %r10, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _encode_done_\@
+_partial_incomplete_2_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_encode_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm9 back to output as ciphertext
+ vpshufb %xmm10, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+.endif
+ # output encrypted Bytes
+ test %r10, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+ # Set r13 to be the number of bytes to write out
+ sub %r12, %r13
+ jmp _count_set_\@
+_partial_fill_\@:
+ mov \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+ vmovdqa %xmm9, %xmm0
+ vmovq %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $8, \DATA_OFFSET
+ psrldq $8, %xmm0
+ vmovq %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $1, \DATA_OFFSET
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+
+ vpshufd $0b01001110, \GH, \T2
+ vpshufd $0b01001110, \HK, \T3
+ vpxor \GH , \T2, \T2 # T2 = (a1+a0)
+ vpxor \HK , \T3, \T3 # T3 = (b1+b0)
+
+ vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
+ vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
+ vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
+ vpxor \GH, \T2,\T2
+ vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
+
+ vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
+ vpxor \T3, \GH, \GH
+ vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
+
+ #first phase of the reduction
+ vpslld $31, \GH, \T2 # packed right shifting << 31
+ vpslld $30, \GH, \T3 # packed right shifting shift << 30
+ vpslld $25, \GH, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+
+ #second phase of the reduction
+
+ vpsrld $1,\GH, \T2 # packed left shifting >> 1
+ vpsrld $2,\GH, \T3 # packed left shifting >> 2
+ vpsrld $7,\GH, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T5, \T2, \T2
+ vpxor \T2, \GH, \GH
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+
+ # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vmovdqa \HK, \T5
+
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_2_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqu \T5, HashKey_3(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_3_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqu \T5, HashKey_4(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_4_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqu \T5, HashKey_5(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_5_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqu \T5, HashKey_6(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_6_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqu \T5, HashKey_7(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_7_k(arg2)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqu \T5, HashKey_8(arg2)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqu \T1, HashKey_8_k(arg2)
+
+.endm
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+ i = (8-\num_initial_blocks)
+ setreg
+ vmovdqu AadHash(arg2), reg_i
+
+ # start AES for num_initial_blocks blocks
+ vmovdqu CurCount(arg2), \CTR
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep \REP
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg4, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep \REP # do REP rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg4, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg4, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg4, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg4, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg4, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg4, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg4, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg4, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+.endm
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg3, arg4 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqu HashKey_8(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+
+ vpshufd $0b01001110, \T2, \T6
+ vpxor \T2, \T6, \T6
+
+ vmovdqu HashKey_8_k(arg2), \T5
+ vpclmulqdq $0x00, \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqu HashKey_7(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_7_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqu HashKey_6(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_6_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqu HashKey_5(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_5_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqu HashKey_4(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_4_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqu HashKey_3(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_3_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqu HashKey_2(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_2_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqu HashKey(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqu HashKey_k(arg2), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vpxor \T4, \T6, \T6
+ vpxor \T7, \T6, \T6
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 11
+ setreg
+.rep (\REP-9)
+
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg4, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ #######################################################################
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+ vpslld $30, \T7, \T3 # packed right shifting shift << 30
+ vpslld $25, \T7, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+ #######################################################################
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T6, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
+
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpxor \XMM1, \T2, \T2
+ vmovdqu HashKey_8(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vmovdqu HashKey_8_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM2, \T2
+ vpxor \XMM2, \T2, \T2
+ vmovdqu HashKey_7(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_7_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM3, \T2
+ vpxor \XMM3, \T2, \T2
+ vmovdqu HashKey_6(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_6_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM4, \T2
+ vpxor \XMM4, \T2, \T2
+ vmovdqu HashKey_5(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_5_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM5, \T2
+ vpxor \XMM5, \T2, \T2
+ vmovdqu HashKey_4(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_4_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM6, \T2
+ vpxor \XMM6, \T2, \T2
+ vmovdqu HashKey_3(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_3_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM7, \T2
+ vpxor \XMM7, \T2, \T2
+ vmovdqu HashKey_2(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_2_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM8, \T2
+ vpxor \XMM8, \T2, \T2
+ vmovdqu HashKey(arg2), \T5
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqu HashKey_k(arg2), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
+ # the accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+ vpslld $30, \T7, \T3 # packed right shifting shift << 30
+ vpslld $25, \T7, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+
+.endm
+
+#############################################################
+#void aesni_gcm_precomp_avx_gen2
+# (gcm_data *my_ctx_data,
+# gcm_context_data *data,
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#############################################################
+ENTRY(aesni_gcm_init_avx_gen2)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_init_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_enc_update_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_enc_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize, %eax
+ cmp $32, %eax
+ je key_256_enc_update
+ cmp $16, %eax
+ je key_128_enc_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_dec_update_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update
+ cmp $16, %eax
+ je key_128_dec_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_finalize_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize
+ cmp $16, %eax
+ je key_128_finalize
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen2)
+
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
+
+ vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
+ vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
+ vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
+ vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
+ vpxor \T3, \GH, \GH
+
+
+ vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
+ vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
+
+ vpxor \T3, \T1, \T1
+ vpxor \T2, \GH, \GH
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \GH, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
+
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \GH, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \GH, \T3, \GH
+ vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \GH, \GH # second phase of the reduction complete
+ #######################################################################
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
+
+ # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vmovdqa \HK, \T5
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqu \T5, HashKey_3(arg2)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqu \T5, HashKey_4(arg2)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqu \T5, HashKey_5(arg2)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqu \T5, HashKey_6(arg2)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqu \T5, HashKey_7(arg2)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqu \T5, HashKey_8(arg2)
+
+.endm
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+ i = (8-\num_initial_blocks)
+ setreg
+ vmovdqu AadHash(arg2), reg_i
+
+ # start AES for num_initial_blocks blocks
+ vmovdqu CurCount(arg2), \CTR
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep \REP
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg4, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
+ # num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep \REP # do REP rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg4, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg4, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg4, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg4, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg4, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg4, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg4, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg4, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
+ # the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+
+.endm
+
+
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg3, arg4 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqu HashKey_8(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+ vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
+ vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
+ vpxor \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqu HashKey_7(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqu HashKey_6(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqu HashKey_5(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqu HashKey_4(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqu HashKey_3(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqu HashKey_2(arg2), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqu HashKey(arg2), \T5
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T1
+
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 11
+ setreg
+.rep (\REP-9)
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg4, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T1, \T1 # the result is in T1
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T1, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
+
+ vmovdqu HashKey_8(arg2), \T5
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM1, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_7(arg2), \T5
+ vpshufd $0b01001110, \XMM2, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM2, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_6(arg2), \T5
+ vpshufd $0b01001110, \XMM3, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM3, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_5(arg2), \T5
+ vpshufd $0b01001110, \XMM4, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM4, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_4(arg2), \T5
+ vpshufd $0b01001110, \XMM5, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM5, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_3(arg2), \T5
+ vpshufd $0b01001110, \XMM6, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM6, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey_2(arg2), \T5
+ vpshufd $0b01001110, \XMM7, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM7, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqu HashKey(arg2), \T5
+ vpshufd $0b01001110, \XMM8, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM8, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
+ # accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T6, \T6 # the result is in T6
+.endm
+
+
+
+#############################################################
+#void aesni_gcm_init_avx_gen4
+# (gcm_data *my_ctx_data,
+# gcm_context_data *data,
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#############################################################
+ENTRY(aesni_gcm_init_avx_gen4)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_init_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_enc_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_enc_update4
+ cmp $16, %eax
+ je key_128_enc_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_dec_update_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update4
+ cmp $16, %eax
+ je key_128_dec_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_finalize_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize4
+ cmp $16, %eax
+ je key_128_finalize4
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen4)
+
+#endif /* CONFIG_AS_AVX2 */
diff --git a/marvell/linux/arch/x86/crypto/aesni-intel_glue.c b/marvell/linux/arch/x86/crypto/aesni-intel_glue.c
new file mode 100644
index 0000000..18cfb76
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/aesni-intel_glue.c
@@ -0,0 +1,1155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Support for Intel AES-NI instructions. This file contains glue
+ * code, the real AES implementation is in intel-aes_asm.S.
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Adrian Hoban <adrian.hoban@intel.com>
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+#include <crypto/ctr.h>
+#include <crypto/b128ops.h>
+#include <crypto/gcm.h>
+#include <crypto/xts.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
+
+
+#define AESNI_ALIGN 16
+#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
+#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
+#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
+#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
+ u8 nonce[4];
+};
+
+struct generic_gcmaes_ctx {
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
+};
+
+struct aesni_xts_ctx {
+ u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+ u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+};
+
+#define GCM_BLOCK_LEN 16
+
+struct gcm_context_data {
+ /* init, update and finalize context data */
+ u8 aad_hash[GCM_BLOCK_LEN];
+ u64 aad_length;
+ u64 in_length;
+ u8 partial_block_enc_key[GCM_BLOCK_LEN];
+ u8 orig_IV[GCM_BLOCK_LEN];
+ u8 current_counter[GCM_BLOCK_LEN];
+ u64 partial_block_len;
+ u64 unused;
+ u8 hash_keys[GCM_BLOCK_LEN * 16];
+};
+
+asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
+asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len);
+asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len);
+asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+#define AVX_GEN2_OPTSIZE 640
+#define AVX_GEN4_OPTSIZE 4096
+
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+#ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes.
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* Scatter / Gather routines, with args similar to above */
+asmlinkage void aesni_gcm_init(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey, const u8 *aad,
+ unsigned long aad_len);
+asmlinkage void aesni_gcm_enc_update(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static const struct aesni_gcm_tfm_s {
+ void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
+ void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+ void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len);
+ void (*finalize)(void *ctx, struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+} *aesni_gcm_tfm;
+
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+ .init = &aesni_gcm_init,
+ .enc_update = &aesni_gcm_enc_update,
+ .dec_update = &aesni_gcm_dec_update,
+ .finalize = &aesni_gcm_finalize,
+};
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+/*
+ * asmlinkage void aesni_gcm_init_avx_gen2()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+ .init = &aesni_gcm_init_avx_gen2,
+ .enc_update = &aesni_gcm_enc_update_avx_gen2,
+ .dec_update = &aesni_gcm_dec_update_avx_gen2,
+ .finalize = &aesni_gcm_finalize_avx_gen2,
+};
+
+#endif
+
+#ifdef CONFIG_AS_AVX2
+/*
+ * asmlinkage void aesni_gcm_init_avx_gen4()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+ .init = &aesni_gcm_init_avx_gen4,
+ .enc_update = &aesni_gcm_enc_update_avx_gen4,
+ .dec_update = &aesni_gcm_dec_update_avx_gen4,
+ .finalize = &aesni_gcm_finalize_avx_gen4,
+};
+
+#endif
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+}
+
+static inline struct
+generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
+{
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+}
+#endif
+
+static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
+{
+ unsigned long addr = (unsigned long)raw_ctx;
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return (struct crypto_aes_ctx *)ALIGN(addr, align);
+}
+
+static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
+ const u8 *in_key, unsigned int key_len)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
+ u32 *flags = &tfm->crt_flags;
+ int err;
+
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ if (!crypto_simd_usable())
+ err = aes_expandkey(ctx, in_key, key_len);
+ else {
+ kernel_fpu_begin();
+ err = aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ }
+
+ return err;
+}
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
+}
+
+static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ if (!crypto_simd_usable()) {
+ aes_encrypt(ctx, dst, src);
+ } else {
+ kernel_fpu_begin();
+ aesni_enc(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ if (!crypto_simd_usable()) {
+ aes_decrypt(ctx, dst, src);
+ } else {
+ kernel_fpu_begin();
+ aesni_dec(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int len)
+{
+ return aes_set_key_common(crypto_skcipher_tfm(tfm),
+ crypto_skcipher_ctx(tfm), key, len);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+#ifdef CONFIG_X86_64
+static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[AES_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ aesni_enc(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv)
+{
+ /*
+ * based on key length, override with the by8 version
+ * of ctr mode encryption/decryption for improved performance
+ * aes_set_key_common() ensures that key length is one of
+ * {128,192,256}
+ */
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+ else
+ aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+ aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ if (walk.nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ keylen /= 2;
+
+ /* first half of xts-key is for crypt */
+ err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+ key, keylen);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+ key + keylen, keylen);
+}
+
+
+static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc);
+}
+
+static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
+}
+
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+}
+
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_enc32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_dec32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_dec }
+ } }
+};
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc,
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx),
+ false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc,
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx),
+ true);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+ struct crypto_aes_ctx ctx;
+ int ret;
+
+ ret = aes_expandkey(&ctx, key, key_len);
+ if (ret)
+ return ret;
+
+ /* Clear the data in the hash sub key container to zero.*/
+ /* We want to cipher all zeros to create the hash sub key. */
+ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+ aes_encrypt(&ctx, hash_subkey, hash_subkey);
+
+ memzero_explicit(&ctx, sizeof(ctx));
+ return 0;
+}
+
+static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
+
+ if (key_len < 4) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ /*Account for 4 byte nonce at the end.*/
+ key_len -= 4;
+
+ memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+
+ return aes_set_key_common(crypto_aead_tfm(aead),
+ &ctx->aes_key_expanded, key, key_len) ?:
+ rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+}
+
+/* This is the Integrity Check Value (aka the authentication tag) length and can
+ * be 8, 12 or 16 bytes long. */
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
+ unsigned int assoclen, u8 *hash_subkey,
+ u8 *iv, void *aes_ctx)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
+ u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
+ struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
+ struct scatter_walk dst_sg_walk = {};
+ unsigned long left = req->cryptlen;
+ unsigned long len, srclen, dstlen;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk src_sg_walk;
+ struct scatterlist src_start[2];
+ struct scatterlist dst_start[2];
+ struct scatterlist *src_sg;
+ struct scatterlist *dst_sg;
+ u8 *src, *dst, *assoc;
+ u8 *assocmem = NULL;
+ u8 authTag[16];
+
+ if (!enc)
+ left -= auth_tag_len;
+
+#ifdef CONFIG_AS_AVX2
+ if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
+ gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+#endif
+#ifdef CONFIG_AS_AVX
+ if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
+ gcm_tfm = &aesni_gcm_tfm_sse;
+#endif
+
+ /* Linearize assoc, if not already linear */
+ if (req->src->length >= assoclen && req->src->length &&
+ (!PageHighMem(sg_page(req->src)) ||
+ req->src->offset + req->src->length <= PAGE_SIZE)) {
+ scatterwalk_start(&assoc_sg_walk, req->src);
+ assoc = scatterwalk_map(&assoc_sg_walk);
+ } else {
+ /* assoc can be any length, so must be on heap */
+ assocmem = kmalloc(assoclen, GFP_ATOMIC);
+ if (unlikely(!assocmem))
+ return -ENOMEM;
+ assoc = assocmem;
+
+ scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+ }
+
+ if (left) {
+ src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+ scatterwalk_start(&src_sg_walk, src_sg);
+ if (req->src != req->dst) {
+ dst_sg = scatterwalk_ffwd(dst_start, req->dst,
+ req->assoclen);
+ scatterwalk_start(&dst_sg_walk, dst_sg);
+ }
+ }
+
+ kernel_fpu_begin();
+ gcm_tfm->init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
+ if (req->src != req->dst) {
+ while (left) {
+ src = scatterwalk_map(&src_sg_walk);
+ dst = scatterwalk_map(&dst_sg_walk);
+ srclen = scatterwalk_clamp(&src_sg_walk, left);
+ dstlen = scatterwalk_clamp(&dst_sg_walk, left);
+ len = min(srclen, dstlen);
+ if (len) {
+ if (enc)
+ gcm_tfm->enc_update(aes_ctx, data,
+ dst, src, len);
+ else
+ gcm_tfm->dec_update(aes_ctx, data,
+ dst, src, len);
+ }
+ left -= len;
+
+ scatterwalk_unmap(src);
+ scatterwalk_unmap(dst);
+ scatterwalk_advance(&src_sg_walk, len);
+ scatterwalk_advance(&dst_sg_walk, len);
+ scatterwalk_done(&src_sg_walk, 0, left);
+ scatterwalk_done(&dst_sg_walk, 1, left);
+ }
+ } else {
+ while (left) {
+ dst = src = scatterwalk_map(&src_sg_walk);
+ len = scatterwalk_clamp(&src_sg_walk, left);
+ if (len) {
+ if (enc)
+ gcm_tfm->enc_update(aes_ctx, data,
+ src, src, len);
+ else
+ gcm_tfm->dec_update(aes_ctx, data,
+ src, src, len);
+ }
+ left -= len;
+ scatterwalk_unmap(src);
+ scatterwalk_advance(&src_sg_walk, len);
+ scatterwalk_done(&src_sg_walk, 1, left);
+ }
+ }
+ gcm_tfm->finalize(aes_ctx, data, authTag, auth_tag_len);
+ kernel_fpu_end();
+
+ if (!assocmem)
+ scatterwalk_unmap(assoc);
+ else
+ kfree(assocmem);
+
+ if (!enc) {
+ u8 authTagMsg[16];
+
+ /* Copy out original authTag */
+ scatterwalk_map_and_copy(authTagMsg, req->src,
+ req->assoclen + req->cryptlen -
+ auth_tag_len,
+ auth_tag_len, 0);
+
+ /* Compare generated tag with passed in tag. */
+ return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
+ -EBADMSG : 0;
+ }
+
+ /* Copy in the authTag */
+ scatterwalk_map_and_copy(authTag, req->dst,
+ req->assoclen + req->cryptlen,
+ auth_tag_len, 1);
+
+ return 0;
+}
+
+static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
+{
+ return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+ aes_ctx);
+}
+
+static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
+{
+ return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+ aes_ctx);
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
+ unsigned int i;
+ __be32 counter = cpu_to_be32(1);
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length equal */
+ /* to 16 or 20 bytes */
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
+ unsigned int i;
+
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length */
+ /* equal to 16 or 20 bytes */
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+#endif
+
+static struct crypto_alg aesni_cipher_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-aesni",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aesni_encrypt,
+ .cia_decrypt = aesni_decrypt
+ }
+ }
+};
+
+static struct skcipher_alg aesni_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(aes)",
+ .cra_driver_name = "__ecb-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(aes)",
+ .cra_driver_name = "__cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+#ifdef CONFIG_X86_64
+ }, {
+ .base = {
+ .cra_name = "__ctr(aes)",
+ .cra_driver_name = "__ctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base = {
+ .cra_name = "__xts(aes)",
+ .cra_driver_name = "__xts-aes-aesni",
+ .cra_priority = 401,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = XTS_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = xts_aesni_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+#endif
+ }
+};
+
+static
+struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
+
+#ifdef CONFIG_X86_64
+static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
+{
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
+
+ return aes_set_key_common(crypto_aead_tfm(aead),
+ &ctx->aes_key_expanded, key, key_len) ?:
+ rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+}
+
+static int generic_gcmaes_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
+ __be32 counter = cpu_to_be32(1);
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int generic_gcmaes_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static struct aead_alg aesni_aeads[] = { {
+ .setkey = common_rfc4106_set_key,
+ .setauthsize = common_rfc4106_set_authsize,
+ .encrypt = helper_rfc4106_encrypt,
+ .decrypt = helper_rfc4106_decrypt,
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "__rfc4106(gcm(aes))",
+ .cra_driver_name = "__rfc4106-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
+}, {
+ .setkey = generic_gcmaes_set_key,
+ .setauthsize = generic_gcmaes_set_authsize,
+ .encrypt = generic_gcmaes_encrypt,
+ .decrypt = generic_gcmaes_decrypt,
+ .ivsize = GCM_AES_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "__gcm(aes)",
+ .cra_driver_name = "__generic-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
+} };
+#else
+static struct aead_alg aesni_aeads[0];
+#endif
+
+static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AES),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+static int __init aesni_init(void)
+{
+ int err;
+
+ if (!x86_match_cpu(aesni_cpu_id))
+ return -ENODEV;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX2
+ if (boot_cpu_has(X86_FEATURE_AVX2)) {
+ pr_info("AVX2 version of gcm_enc/dec engaged.\n");
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
+ } else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX version of gcm_enc/dec engaged.\n");
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+ } else
+#endif
+ {
+ pr_info("SSE version of gcm_enc/dec engaged.\n");
+ aesni_gcm_tfm = &aesni_gcm_tfm_sse;
+ }
+ aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+ if (boot_cpu_has(X86_FEATURE_AVX)) {
+ /* optimize performance of ctr mode encryption transform */
+ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ pr_info("AES CTR mode by8 optimization enabled\n");
+ }
+#endif
+#endif
+
+ err = crypto_register_alg(&aesni_cipher_alg);
+ if (err)
+ return err;
+
+ err = simd_register_skciphers_compat(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
+ if (err)
+ goto unregister_cipher;
+
+ err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
+ if (err)
+ goto unregister_skciphers;
+
+ return 0;
+
+unregister_skciphers:
+ simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
+unregister_cipher:
+ crypto_unregister_alg(&aesni_cipher_alg);
+ return err;
+}
+
+static void __exit aesni_exit(void)
+{
+ simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
+ simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
+ crypto_unregister_alg(&aesni_cipher_alg);
+}
+
+late_initcall(aesni_init);
+module_exit(aesni_exit);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/marvell/linux/arch/x86/crypto/blowfish-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/blowfish-x86_64-asm_64.S
new file mode 100644
index 0000000..330db7a
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Blowfish Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-x86_64-asm.S"
+.text
+
+/* structure of crypto context */
+#define p 0
+#define s0 ((16 + 2) * 4)
+#define s1 ((16 + 2 + (1 * 256)) * 4)
+#define s2 ((16 + 2 + (2 * 256)) * 4)
+#define s3 ((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX %r12
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rdi
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %edi
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F() \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT1d; \
+ rolq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT1,4), RT0d; \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT2d; \
+ rolq $32, RX0; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT2,4), RT0d; \
+ xorq RT0, RX0;
+
+#define add_roundkey_enc(n) \
+ xorq p+4*(n)(CTX), RX0;
+
+#define round_enc(n) \
+ add_roundkey_enc(n); \
+ \
+ F(); \
+ F();
+
+#define add_roundkey_dec(n) \
+ movq p+4*(n-1)(CTX), RT0; \
+ rorq $32, RT0; \
+ xorq RT0, RX0;
+
+#define round_dec(n) \
+ add_roundkey_dec(n); \
+ \
+ F(); \
+ F(); \
+
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+#define xor_block() \
+ bswapq RX0; \
+ xorq RX0, (RIO);
+
+ENTRY(__blowfish_enc_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ add_roundkey_enc(16);
+
+ movq %r11, %r12;
+
+ movq %r10, RIO;
+ test %cl, %cl;
+ jnz .L__enc_xor;
+
+ write_block();
+ ret;
+.L__enc_xor:
+ xor_block();
+ ret;
+ENDPROC(__blowfish_enc_blk)
+
+ENTRY(blowfish_dec_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ add_roundkey_dec(1);
+
+ movq %r10, RIO;
+ write_block();
+
+ movq %r11, %r12;
+
+ ret;
+ENDPROC(blowfish_dec_blk)
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+
+/* F() for 4-way. Slower when used alone/1-way, but faster when used
+ * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
+ */
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; \
+ xorq RKEY, RX1; \
+ xorq RKEY, RX2; \
+ xorq RKEY, RX3;
+
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define read_block4() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0; \
+ \
+ movq 8(RIO), RX1; \
+ rorq $32, RX1; \
+ bswapq RX1; \
+ \
+ movq 16(RIO), RX2; \
+ rorq $32, RX2; \
+ bswapq RX2; \
+ \
+ movq 24(RIO), RX3; \
+ rorq $32, RX3; \
+ bswapq RX3;
+
+#define write_block4() \
+ bswapq RX0; \
+ movq RX0, (RIO); \
+ \
+ bswapq RX1; \
+ movq RX1, 8(RIO); \
+ \
+ bswapq RX2; \
+ movq RX2, 16(RIO); \
+ \
+ bswapq RX3; \
+ movq RX3, 24(RIO);
+
+#define xor_block4() \
+ bswapq RX0; \
+ xorq RX0, (RIO); \
+ \
+ bswapq RX1; \
+ xorq RX1, 8(RIO); \
+ \
+ bswapq RX2; \
+ xorq RX2, 16(RIO); \
+ \
+ bswapq RX3; \
+ xorq RX3, 24(RIO);
+
+ENTRY(__blowfish_enc_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ pushq %r12;
+ pushq %rbx;
+ pushq %rcx;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+ movq %rdx, RIO;
+
+ preload_roundkey_enc(0);
+
+ read_block4();
+
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ add_preloaded_roundkey4();
+
+ popq %r12;
+ movq %r11, RIO;
+
+ test %r12b, %r12b;
+ jnz .L__enc_xor4;
+
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+ ret;
+
+.L__enc_xor4:
+ xor_block4();
+
+ popq %rbx;
+ popq %r12;
+ ret;
+ENDPROC(__blowfish_enc_blk_4way)
+
+ENTRY(blowfish_dec_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %r12;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11
+ movq %rdx, RIO;
+
+ preload_roundkey_dec(17);
+ read_block4();
+
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ add_preloaded_roundkey4();
+
+ movq %r11, RIO;
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+
+ ret;
+ENDPROC(blowfish_dec_blk_4way)
diff --git a/marvell/linux/arch/x86/crypto/blowfish_glue.c b/marvell/linux/arch/x86/crypto/blowfish_glue.c
new file mode 100644
index 0000000..cedfdba
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/blowfish_glue.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for assembler optimized version of Blowfish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/* regular block cipher functions */
+asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+ __blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk_4way(ctx, dst, src, true);
+}
+
+static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return blowfish_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_crypt(struct skcipher_request *req,
+ void (*fn)(struct bf_ctx *, u8 *, const u8 *),
+ void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ fn_4way(ctx, wdst, wsrc);
+
+ wsrc += bsize * 4;
+ wdst += bsize * 4;
+ nbytes -= bsize * 4;
+ } while (nbytes >= bsize * 4);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way);
+}
+
+static unsigned int __cbc_encrypt(struct bf_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct bf_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[4 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ nbytes -= bsize * 4 - bsize;
+ src -= 4 - 1;
+ dst -= 4 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+ ivs[2] = src[2];
+
+ blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+ dst[3] ^= ivs[2];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 4);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[BF_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ blowfish_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, BF_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[4];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ }
+
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+ ctrblocks[3] = cpu_to_be64(ctrblk++);
+
+ blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
+ (u8 *)ctrblocks);
+
+ src += 4;
+ dst += 4;
+ } while ((nbytes -= bsize * 4) >= bsize * 4);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (dst != src)
+ *dst = *src;
+
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ if (nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static struct crypto_alg bf_cipher_alg = {
+ .cra_name = "blowfish",
+ .cra_driver_name = "blowfish-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = BF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct bf_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = BF_MIN_KEY_SIZE,
+ .cia_max_keysize = BF_MAX_KEY_SIZE,
+ .cia_setkey = blowfish_setkey,
+ .cia_encrypt = blowfish_encrypt,
+ .cia_decrypt = blowfish_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg bf_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(blowfish)",
+ .base.cra_driver_name = "ecb-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(blowfish)",
+ .base.cra_driver_name = "cbc-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .ivsize = BF_BLOCK_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(blowfish)",
+ .base.cra_driver_name = "ctr-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .ivsize = BF_BLOCK_SIZE,
+ .chunksize = BF_BLOCK_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, blowfish-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "blowfish-x86_64: performance on this CPU "
+ "would be suboptimal: disabling "
+ "blowfish-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&bf_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&bf_cipher_alg);
+
+ return err;
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&bf_cipher_alg);
+ crypto_unregister_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("blowfish");
+MODULE_ALIAS_CRYPTO("blowfish-asm");
diff --git a/marvell/linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/marvell/linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 0000000..a14af6e
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1289 @@
+/*
+ * x86_64/AVX/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/*
+ * Version licensed under 2-clause BSD License is available at:
+ * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+ 16-way camellia
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vmovdqa .Linv_shift_row, t4; \
+ vbroadcastss .L0f0f0f0f, t7; \
+ vmovdqa .Lpre_tf_lo_s1, t0; \
+ vmovdqa .Lpre_tf_hi_s1, t1; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ vmovdqa .Lpre_tf_lo_s4, t2; \
+ vmovdqa .Lpre_tf_hi_s4, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x1, t0, t1, t7, t6); \
+ filter_8bit(x4, t0, t1, t7, t6); \
+ filter_8bit(x2, t0, t1, t7, t6); \
+ filter_8bit(x5, t0, t1, t7, t6); \
+ \
+ /* prefilter sbox 4 */ \
+ vpxor t4, t4, t4; \
+ filter_8bit(x3, t2, t3, t7, t6); \
+ filter_8bit(x6, t2, t3, t7, t6); \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vmovdqa .Lpost_tf_lo_s1, t0; \
+ vmovdqa .Lpost_tf_hi_s1, t1; \
+ vaesenclast t4, x0, x0; \
+ vaesenclast t4, x7, x7; \
+ vaesenclast t4, x1, x1; \
+ vaesenclast t4, x4, x4; \
+ vaesenclast t4, x2, x2; \
+ vaesenclast t4, x5, x5; \
+ vaesenclast t4, x3, x3; \
+ vaesenclast t4, x6, x6; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vmovdqa .Lpost_tf_lo_s3, t2; \
+ vmovdqa .Lpost_tf_hi_s3, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vmovdqa .Lpost_tf_lo_s2, t4; \
+ vmovdqa .Lpost_tf_hi_s2, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpxor t6, t6, t6; \
+ vmovq key, t0; \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ \
+ vpsrldq $5, t0, t5; \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpsrldq $3, t0, t3; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t6, t0, t0; \
+ vpshufb t6, t1, t1; \
+ vpshufb t6, t2, t2; \
+ vpshufb t6, t3, t3; \
+ vpshufb t6, t4, t4; \
+ vpsrldq $2, t5, t7; \
+ vpshufb t6, t7, t7; \
+ \
+ /* \
+ * P-function \
+ */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* \
+ * Add key material and result to CD (x becomes new CD) \
+ */ \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 16(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 16(mem_cd), x5, x5; \
+ \
+ vpsrldq $1, t5, t3; \
+ vpshufb t6, t5, t5; \
+ vpshufb t6, t3, t6; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 16(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 16(mem_cd), x7, x7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 16(mem_cd), x0, x0; \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 16(mem_cd), x1, x1; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 16(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * Size optimization... with inlined roundsm16, binary would be over 5 times
+ * larger and would only be 0.5% faster (on sandy-bridge).
+ */
+.align 8
+roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+ roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %rcx, (%r9));
+ ret;
+ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+ roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
+ %rax, (%r9));
+ ret;
+ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cb: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x4, 0 * 16(mem_cd); \
+ vmovdqu x5, 1 * 16(mem_cd); \
+ vmovdqu x6, 2 * 16(mem_cd); \
+ vmovdqu x7, 3 * 16(mem_cd); \
+ vmovdqu x0, 4 * 16(mem_cd); \
+ vmovdqu x1, 5 * 16(mem_cd); \
+ vmovdqu x2, 6 * 16(mem_cd); \
+ vmovdqu x3, 7 * 16(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpxor tt0, tt0, tt0; \
+ vmovd kll, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vmovdqu l4, 4 * 16(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 16(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 16(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 16(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vmovd krr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 16(r), t0, t0; \
+ vpor 5 * 16(r), t1, t1; \
+ vpor 6 * 16(r), t2, t2; \
+ vpor 7 * 16(r), t3, t3; \
+ \
+ vpxor 0 * 16(r), t0, t0; \
+ vpxor 1 * 16(r), t1, t1; \
+ vpxor 2 * 16(r), t2, t2; \
+ vpxor 3 * 16(r), t3, t3; \
+ vmovdqu t0, 0 * 16(r); \
+ vmovdqu t1, 1 * 16(r); \
+ vmovdqu t2, 2 * 16(r); \
+ vmovdqu t3, 3 * 16(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vmovd krl, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 16(r), t0, t0; \
+ vpand 1 * 16(r), t1, t1; \
+ vpand 2 * 16(r), t2, t2; \
+ vpand 3 * 16(r), t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 16(r), t0, t0; \
+ vpxor 5 * 16(r), t1, t1; \
+ vpxor 6 * 16(r), t2, t2; \
+ vpxor 7 * 16(r), t3, t3; \
+ vmovdqu t0, 4 * 16(r); \
+ vmovdqu t1, 5 * 16(r); \
+ vmovdqu t2, 6 * 16(r); \
+ vmovdqu t3, 7 * 16(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vmovd klr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 16(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 16(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 16(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 16(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
+ b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor 0 * 16(rio), x0, y7; \
+ vpxor 1 * 16(rio), x0, y6; \
+ vpxor 2 * 16(rio), x0, y5; \
+ vpxor 3 * 16(rio), x0, y4; \
+ vpxor 4 * 16(rio), x0, y3; \
+ vpxor 5 * 16(rio), x0, y2; \
+ vpxor 6 * 16(rio), x0, y1; \
+ vpxor 7 * 16(rio), x0, y0; \
+ vpxor 8 * 16(rio), x0, x7; \
+ vpxor 9 * 16(rio), x0, x6; \
+ vpxor 10 * 16(rio), x0, x5; \
+ vpxor 11 * 16(rio), x0, x4; \
+ vpxor 12 * 16(rio), x0, x3; \
+ vpxor 13 * 16(rio), x0, x2; \
+ vpxor 14 * 16(rio), x0, x1; \
+ vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
+ y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 16(rio); \
+ vmovdqu x1, 1 * 16(rio); \
+ vmovdqu x2, 2 * 16(rio); \
+ vmovdqu x3, 3 * 16(rio); \
+ vmovdqu x4, 4 * 16(rio); \
+ vmovdqu x5, 5 * 16(rio); \
+ vmovdqu x6, 6 * 16(rio); \
+ vmovdqu x7, 7 * 16(rio); \
+ vmovdqu y0, 8 * 16(rio); \
+ vmovdqu y1, 9 * 16(rio); \
+ vmovdqu y2, 10 * 16(rio); \
+ vmovdqu y3, 11 * 16(rio); \
+ vmovdqu y4, 12 * 16(rio); \
+ vmovdqu y5, 13 * 16(rio); \
+ vmovdqu y6, 14 * 16(rio); \
+ vmovdqu y7, 15 * 16(rio);
+
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x80808080
+ .long 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %xmm0..%xmm15: 16 plaintext blocks
+ * output:
+ * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk16)
+
+.align 8
+__camellia_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %xmm0..%xmm15: 16 encrypted blocks
+ * output:
+ * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Ldec_max32:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk16)
+
+ENTRY(camellia_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_enc_16way)
+
+ENTRY(camellia_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_dec_16way)
+
+ENTRY(camellia_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /*
+ * dst might still be in-use (in case dst == src), so use stack for
+ * temporary storage.
+ */
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ call __camellia_dec_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+ vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+ vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+ vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+ vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+ vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+ vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+ vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+ vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+ vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+ vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+ vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+ vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+ vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+ vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_cbc_dec_16way)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+ENTRY(camellia_ctr_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lbswap128_mask, %xmm14;
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm0;
+ vpshufb %xmm14, %xmm0, %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+
+ vpcmpeqd %xmm15, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+ /* construct IVs */
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 14 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 13 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm12;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm11;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm10;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm9;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm8;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm7;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm6;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm5;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm4;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm3;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm2;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vmovdqa %xmm0, %xmm13;
+ vpshufb %xmm14, %xmm0, %xmm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vmovdqu %xmm13, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor 13 * 16(%rax), %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ * %r8: index for input whitening key
+ * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+ */
+ FRAME_BEGIN
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+ /* load IV */
+ vmovdqu (%rcx), %xmm0;
+ vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+ vmovdqu %xmm0, 0 * 16(%rsi);
+
+ /* construct IVs */
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 14 * 16(%rax);
+ vmovdqu %xmm0, 1 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+ vmovdqu %xmm0, 2 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+ vmovdqu %xmm0, 3 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+ vmovdqu %xmm0, 4 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+ vmovdqu %xmm0, 5 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+ vmovdqu %xmm0, 6 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+ vmovdqu %xmm0, 7 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+ vmovdqu %xmm0, 8 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+ vmovdqu %xmm0, 9 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+ vmovdqu %xmm0, 10 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+ vmovdqu %xmm0, 11 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+ vmovdqu %xmm0, 12 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+ vmovdqu %xmm0, 13 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+ vmovdqu %xmm0, 14 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 0 * 16(%rax);
+ vmovdqu %xmm0, 15 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vmovdqu %xmm0, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX, %r8, 8), %xmm15;
+ vpshufb .Lpack_bswap, %xmm15, %xmm15;
+ vpxor 0 * 16(%rax), %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ CALL_NOSPEC %r9;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+ leaq __camellia_enc_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+ leaq __camellia_dec_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
diff --git a/marvell/linux/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/marvell/linux/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
new file mode 100644
index 0000000..4be4c7c
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -0,0 +1,1403 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/**********************************************************************
+ 32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vbroadcasti128 .Linv_shift_row, t4; \
+ vpbroadcastd .L0f0f0f0f, t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ /* prefilter sbox 4 */ \
+ filter_8bit(x0, t5, t6, t7, t4); \
+ filter_8bit(x7, t5, t6, t7, t4); \
+ vextracti128 $1, x0, t0##_x; \
+ vextracti128 $1, x7, t1##_x; \
+ filter_8bit(x3, t2, t3, t7, t4); \
+ filter_8bit(x6, t2, t3, t7, t4); \
+ vextracti128 $1, x3, t3##_x; \
+ vextracti128 $1, x6, t2##_x; \
+ filter_8bit(x2, t5, t6, t7, t4); \
+ filter_8bit(x5, t5, t6, t7, t4); \
+ filter_8bit(x1, t5, t6, t7, t4); \
+ filter_8bit(x4, t5, t6, t7, t4); \
+ \
+ vpxor t4##_x, t4##_x, t4##_x; \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vextracti128 $1, x2, t6##_x; \
+ vextracti128 $1, x5, t5##_x; \
+ vaesenclast t4##_x, x0##_x, x0##_x; \
+ vaesenclast t4##_x, t0##_x, t0##_x; \
+ vinserti128 $1, t0##_x, x0, x0; \
+ vaesenclast t4##_x, x7##_x, x7##_x; \
+ vaesenclast t4##_x, t1##_x, t1##_x; \
+ vinserti128 $1, t1##_x, x7, x7; \
+ vaesenclast t4##_x, x3##_x, x3##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x3, x3; \
+ vaesenclast t4##_x, x6##_x, x6##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x6, x6; \
+ vextracti128 $1, x1, t3##_x; \
+ vextracti128 $1, x4, t2##_x; \
+ vbroadcasti128 .Lpost_tf_lo_s1, t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+ vaesenclast t4##_x, x2##_x, x2##_x; \
+ vaesenclast t4##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ vaesenclast t4##_x, x5##_x, x5##_x; \
+ vaesenclast t4##_x, t5##_x, t5##_x; \
+ vinserti128 $1, t5##_x, x5, x5; \
+ vaesenclast t4##_x, x1##_x, x1##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x1, x1; \
+ vaesenclast t4##_x, x4##_x, x4##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x4, x4; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vbroadcasti128 .Lpost_tf_lo_s3, t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vbroadcasti128 .Lpost_tf_lo_s2, t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ vpxor t7, t7, t7; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
+ * larger and would only marginally faster.
+ */
+.align 8
+roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+ roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %rcx, (%r9));
+ ret;
+ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+ roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+ %rax, (%r9));
+ ret;
+ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cb: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x0, 4 * 32(mem_cd); \
+ vmovdqu x1, 5 * 32(mem_cd); \
+ vmovdqu x2, 6 * 32(mem_cd); \
+ vmovdqu x3, 7 * 32(mem_cd); \
+ vmovdqu x4, 0 * 32(mem_cd); \
+ vmovdqu x5, 1 * 32(mem_cd); \
+ vmovdqu x6, 2 * 32(mem_cd); \
+ vmovdqu x7, 3 * 32(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+ vpxor tt0, tt0, tt0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+ vmovdqu l4, 4 * 32(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 32(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 32(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 32(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 32(r), t0, t0; \
+ vpor 5 * 32(r), t1, t1; \
+ vpor 6 * 32(r), t2, t2; \
+ vpor 7 * 32(r), t3, t3; \
+ \
+ vpxor 0 * 32(r), t0, t0; \
+ vpxor 1 * 32(r), t1, t1; \
+ vpxor 2 * 32(r), t2, t2; \
+ vpxor 3 * 32(r), t3, t3; \
+ vmovdqu t0, 0 * 32(r); \
+ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 1 * 32(r); \
+ vmovdqu t2, 2 * 32(r); \
+ vmovdqu t3, 3 * 32(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 32(r), t0, t0; \
+ vpand 1 * 32(r), t1, t1; \
+ vpand 2 * 32(r), t2, t2; \
+ vpand 3 * 32(r), t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 32(r), t0, t0; \
+ vpxor 5 * 32(r), t1, t1; \
+ vpxor 6 * 32(r), t2, t2; \
+ vpxor 7 * 32(r), t3, t3; \
+ vmovdqu t0, 4 * 32(r); \
+ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 5 * 32(r); \
+ vmovdqu t2, 6 * 32(r); \
+ vmovdqu t3, 7 * 32(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 32(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 32(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 32(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 32(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor 0 * 32(rio), x0, y7; \
+ vpxor 1 * 32(rio), x0, y6; \
+ vpxor 2 * 32(rio), x0, y5; \
+ vpxor 3 * 32(rio), x0, y4; \
+ vpxor 4 * 32(rio), x0, y3; \
+ vpxor 5 * 32(rio), x0, y2; \
+ vpxor 6 * 32(rio), x0, y1; \
+ vpxor 7 * 32(rio), x0, y0; \
+ vpxor 8 * 32(rio), x0, x7; \
+ vpxor 9 * 32(rio), x0, x6; \
+ vpxor 10 * 32(rio), x0, x5; \
+ vpxor 11 * 32(rio), x0, x4; \
+ vpxor 12 * 32(rio), x0, x3; \
+ vpxor 13 * 32(rio), x0, x2; \
+ vpxor 14 * 32(rio), x0, x1; \
+ vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 32(rio); \
+ vmovdqu x1, 1 * 32(rio); \
+ vmovdqu x2, 2 * 32(rio); \
+ vmovdqu x3, 3 * 32(rio); \
+ vmovdqu x4, 4 * 32(rio); \
+ vmovdqu x5, 5 * 32(rio); \
+ vmovdqu x6, 6 * 32(rio); \
+ vmovdqu x7, 7 * 32(rio); \
+ vmovdqu y0, 8 * 32(rio); \
+ vmovdqu y1, 9 * 32(rio); \
+ vmovdqu y2, 10 * 32(rio); \
+ vmovdqu y3, 11 * 32(rio); \
+ vmovdqu y4, 12 * 32(rio); \
+ vmovdqu y5, 13 * 32(rio); \
+ vmovdqu y6, 14 * 32(rio); \
+ vmovdqu y7, 15 * 32(rio);
+
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
+.align 32
+.Lpack_bswap:
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode */
+.Lxts_gf128mul_and_shl1_mask_0:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+ .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %ymm0..%ymm15: 32 plaintext blocks
+ * output:
+ * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk32)
+
+.align 8
+__camellia_dec_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %ymm0..%ymm15: 16 encrypted blocks
+ * output:
+ * %ymm0..%ymm15: 16 plaintext blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Ldec_max32:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk32)
+
+ENTRY(camellia_ecb_enc_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_enc_32way)
+
+ENTRY(camellia_ecb_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_dec_32way)
+
+ENTRY(camellia_cbc_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ movq %rsp, %r10;
+ cmpq %rsi, %rdx;
+ je .Lcbc_dec_use_stack;
+
+ /* dst can be used as temporary storage, src is not overwritten. */
+ movq %rsi, %rax;
+ jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+ /*
+ * dst still in-use (because dst == src), so use stack for temporary
+ * storage.
+ */
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+ call __camellia_dec_blk32;
+
+ vmovdqu %ymm7, (%rax);
+ vpxor %ymm7, %ymm7, %ymm7;
+ vinserti128 $1, (%rdx), %ymm7, %ymm7;
+ vpxor (%rax), %ymm7, %ymm7;
+ movq %r10, %rsp;
+ vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+ vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+ vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+ vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+ vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+ vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+ vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+ vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+ vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+ vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+ vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+ vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+ vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+ vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+ vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_cbc_dec_32way)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+ vpcmpeqq minus_one, x, tmp1; \
+ vpcmpeqq minus_two, x, tmp2; \
+ vpsubq minus_two, x, x; \
+ vpor tmp2, tmp1, tmp1; \
+ vpslldq $8, tmp1, tmp1; \
+ vpsubq tmp1, x, x;
+
+ENTRY(camellia_ctr_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ movq %rsp, %r10;
+ cmpq %rsi, %rdx;
+ je .Lctr_use_stack;
+
+ /* dst can be used as temporary storage, src is not overwritten. */
+ movq %rsi, %rax;
+ jmp .Lctr_continue;
+
+.Lctr_use_stack:
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+.Lctr_continue:
+ vpcmpeqd %ymm15, %ymm15, %ymm15;
+ vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm0;
+ vmovdqa %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm14);
+ vbroadcasti128 .Lbswap128_mask, %ymm14;
+ vinserti128 $1, %xmm0, %ymm1, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 15 * 32(%rax);
+
+ /* construct IVs */
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 14 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 13 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 12 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 11 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm10;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm9;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm8;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm7;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm6;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm5;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm4;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm3;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm2;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm1;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vextracti128 $1, %ymm0, %xmm13;
+ vpshufb %ymm14, %ymm0, %ymm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vmovdqu %xmm13, (%rcx);
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor 11 * 32(%rax), %ymm15, %ymm11;
+ vpxor 12 * 32(%rax), %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ movq %r10, %rsp;
+
+ vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+ vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+ vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+ vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+ vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+ vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+ vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+ vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+ vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+ vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+ vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+ vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+ vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+ vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+ vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+ vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ctr_32way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+ vpsrad $31, iv, tmp0; \
+ vpaddq iv, iv, tmp1; \
+ vpsllq $2, iv, iv; \
+ vpshufd $0x13, tmp0, tmp0; \
+ vpsrad $31, tmp1, tmp1; \
+ vpand mask2, tmp0, tmp0; \
+ vpshufd $0x13, tmp1, tmp1; \
+ vpxor tmp0, iv, iv; \
+ vpand mask1, tmp1, tmp1; \
+ vpxor tmp1, iv, iv;
+
+.align 8
+camellia_xts_crypt_32way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ * %r8: index for input whitening key
+ * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+ vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
+
+ /* load IV and construct second IV */
+ vmovdqu (%rcx), %xmm0;
+ vmovdqa %xmm0, %xmm15;
+ gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
+ vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
+ vinserti128 $1, %xmm0, %ymm15, %ymm0;
+ vpxor 0 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 15 * 32(%rax);
+ vmovdqu %ymm0, 0 * 32(%rsi);
+
+ /* construct IVs */
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 1 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 14 * 32(%rax);
+ vmovdqu %ymm0, 1 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 2 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 13 * 32(%rax);
+ vmovdqu %ymm0, 2 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 3 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 12 * 32(%rax);
+ vmovdqu %ymm0, 3 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 4 * 32(%rdx), %ymm0, %ymm11;
+ vmovdqu %ymm0, 4 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 5 * 32(%rdx), %ymm0, %ymm10;
+ vmovdqu %ymm0, 5 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 6 * 32(%rdx), %ymm0, %ymm9;
+ vmovdqu %ymm0, 6 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 7 * 32(%rdx), %ymm0, %ymm8;
+ vmovdqu %ymm0, 7 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 8 * 32(%rdx), %ymm0, %ymm7;
+ vmovdqu %ymm0, 8 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 9 * 32(%rdx), %ymm0, %ymm6;
+ vmovdqu %ymm0, 9 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 10 * 32(%rdx), %ymm0, %ymm5;
+ vmovdqu %ymm0, 10 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 11 * 32(%rdx), %ymm0, %ymm4;
+ vmovdqu %ymm0, 11 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 12 * 32(%rdx), %ymm0, %ymm3;
+ vmovdqu %ymm0, 12 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 13 * 32(%rdx), %ymm0, %ymm2;
+ vmovdqu %ymm0, 13 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 14 * 32(%rdx), %ymm0, %ymm1;
+ vmovdqu %ymm0, 14 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 15 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 0 * 32(%rax);
+ vmovdqu %ymm0, 15 * 32(%rsi);
+
+ vextracti128 $1, %ymm0, %xmm0;
+ gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
+ vmovdqu %xmm0, (%rcx);
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+ vpshufb .Lpack_bswap, %ymm15, %ymm15;
+ vpxor 0 * 32(%rax), %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor 12 * 32(%rax), %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ CALL_NOSPEC %r9;
+
+ addq $(16 * 32), %rsp;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_xts_crypt_32way)
+
+ENTRY(camellia_xts_enc_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+ leaq __camellia_enc_blk32, %r9;
+
+ jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_enc_32way)
+
+ENTRY(camellia_xts_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+ leaq __camellia_dec_blk32, %r9;
+
+ jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_dec_32way)
diff --git a/marvell/linux/arch/x86/crypto/camellia-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/camellia-x86_64-asm_64.S
new file mode 100644
index 0000000..23528bc
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -0,0 +1,499 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Camellia Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "camellia-x86_64-asm_64.S"
+.text
+
+.extern camellia_sp10011110;
+.extern camellia_sp22000222;
+.extern camellia_sp03303033;
+.extern camellia_sp00444404;
+.extern camellia_sp02220222;
+.extern camellia_sp30333033;
+.extern camellia_sp44044404;
+.extern camellia_sp11101110;
+
+#define sp10011110 camellia_sp10011110
+#define sp22000222 camellia_sp22000222
+#define sp03303033 camellia_sp03303033
+#define sp00444404 camellia_sp00444404
+#define sp02220222 camellia_sp02220222
+#define sp30333033 camellia_sp30333033
+#define sp44044404 camellia_sp44044404
+#define sp11101110 camellia_sp11101110
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RIOd %esi
+
+#define RAB0 %rax
+#define RCD0 %rcx
+#define RAB1 %rbx
+#define RCD1 %rdx
+
+#define RAB0d %eax
+#define RCD0d %ecx
+#define RAB1d %ebx
+#define RCD1d %edx
+
+#define RAB0bl %al
+#define RCD0bl %cl
+#define RAB1bl %bl
+#define RCD1bl %dl
+
+#define RAB0bh %ah
+#define RCD0bh %ch
+#define RAB1bh %bh
+#define RCD1bh %dh
+
+#define RT0 %rsi
+#define RT1 %r12
+#define RT2 %r8
+
+#define RT0d %esi
+#define RT1d %r12d
+#define RT2d %r8d
+
+#define RT2bl %r8b
+
+#define RXOR %r9
+#define RR12 %r10
+#define RDST %r11
+
+#define RXORd %r9d
+#define RXORbl %r9b
+
+#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $16, ab; \
+ xorq T0(, tmp2, 8), dst; \
+ xorq T1(, tmp1, 8), dst;
+
+/**********************************************************************
+ 1-way camellia
+ **********************************************************************/
+#define roundsm(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xorq RT2, cd ## 0;
+
+#define fls(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
+ orq l ## 0, RT2; \
+ shrq $32, RT2; \
+ xorq RT2, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
+ andl r ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, r ## 0;
+
+#define enc_rounds(i) \
+ roundsm(RAB, i + 2, RCD); \
+ roundsm(RCD, i + 3, RAB); \
+ roundsm(RAB, i + 4, RCD); \
+ roundsm(RCD, i + 5, RAB); \
+ roundsm(RAB, i + 6, RCD); \
+ roundsm(RCD, i + 7, RAB);
+
+#define enc_fls(i) \
+ fls(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX), RAB0;
+
+#define enc_outunpack(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO);
+
+#define dec_rounds(i) \
+ roundsm(RAB, i + 7, RCD); \
+ roundsm(RCD, i + 6, RAB); \
+ roundsm(RAB, i + 5, RCD); \
+ roundsm(RCD, i + 4, RAB); \
+ roundsm(RAB, i + 3, RCD); \
+ roundsm(RCD, i + 2, RAB);
+
+#define dec_fls(i) \
+ fls(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0;
+
+#define dec_outunpack() \
+ xorq key_table(CTX), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO);
+
+ENTRY(__camellia_enc_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ movq %r12, RR12;
+
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack();
+
+ enc_rounds(0);
+ enc_fls(8);
+ enc_rounds(8);
+ enc_fls(16);
+ enc_rounds(16);
+ movl $24, RT1d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc_done;
+
+ enc_fls(24);
+ enc_rounds(24);
+ movl $32, RT1d; /* max */
+
+.L__enc_done:
+ testb RXORbl, RXORbl;
+ movq RDST, RIO;
+
+ jnz .L__enc_xor;
+
+ enc_outunpack(mov, RT1);
+
+ movq RR12, %r12;
+ ret;
+
+.L__enc_xor:
+ enc_outunpack(xor, RT1);
+
+ movq RR12, %r12;
+ ret;
+ENDPROC(__camellia_enc_blk)
+
+ENTRY(camellia_dec_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec_rounds16;
+
+ dec_rounds(24);
+ dec_fls(24);
+
+.L__dec_rounds16:
+ dec_rounds(16);
+ dec_fls(16);
+ dec_rounds(8);
+ dec_fls(8);
+ dec_rounds(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack();
+
+ movq RR12, %r12;
+ ret;
+ENDPROC(camellia_dec_blk)
+
+/**********************************************************************
+ 2-way camellia
+ **********************************************************************/
+#define roundsm2(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ xorq RT2, cd ## 1; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
+ xorq RT2, cd ## 0; \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
+
+#define fls2(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
+ andl l ## 1d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, l ## 1; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
+ orq r ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, r ## 1; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
+ orq l ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
+ andl r ## 0d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
+ orq l ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, l ## 1; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
+ andl r ## 1d, RT1d; \
+ roll $1, RT1d; \
+ shlq $32, RT1; \
+ xorq RT1, r ## 1;
+
+#define enc_rounds2(i) \
+ roundsm2(RAB, i + 2, RCD); \
+ roundsm2(RCD, i + 3, RAB); \
+ roundsm2(RAB, i + 4, RCD); \
+ roundsm2(RCD, i + 5, RAB); \
+ roundsm2(RAB, i + 6, RCD); \
+ roundsm2(RCD, i + 7, RAB);
+
+#define enc_fls2(i) \
+ fls2(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack2() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX), RAB1;
+
+#define enc_outunpack2(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX, max, 8), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ op ## q RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ op ## q RAB1, 12*2(RIO);
+
+#define dec_rounds2(i) \
+ roundsm2(RAB, i + 7, RCD); \
+ roundsm2(RCD, i + 6, RAB); \
+ roundsm2(RAB, i + 5, RCD); \
+ roundsm2(RCD, i + 4, RAB); \
+ roundsm2(RAB, i + 3, RCD); \
+ roundsm2(RCD, i + 2, RAB);
+
+#define dec_fls2(i) \
+ fls2(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack2(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX, max, 8), RAB1;
+
+#define dec_outunpack2() \
+ xorq key_table(CTX), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ movq RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ movq RAB1, 12*2(RIO);
+
+ENTRY(__camellia_enc_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ pushq %rbx;
+
+ movq %r12, RR12;
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack2();
+
+ enc_rounds2(0);
+ enc_fls2(8);
+ enc_rounds2(8);
+ enc_fls2(16);
+ enc_rounds2(16);
+ movl $24, RT2d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc2_done;
+
+ enc_fls2(24);
+ enc_rounds2(24);
+ movl $32, RT2d; /* max */
+
+.L__enc2_done:
+ test RXORbl, RXORbl;
+ movq RDST, RIO;
+ jnz .L__enc2_xor;
+
+ enc_outunpack2(mov, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ ret;
+
+.L__enc2_xor:
+ enc_outunpack2(xor, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ ret;
+ENDPROC(__camellia_enc_blk_2way)
+
+ENTRY(camellia_dec_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %rbx, RXOR;
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack2(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec2_rounds16;
+
+ dec_rounds2(24);
+ dec_fls2(24);
+
+.L__dec2_rounds16:
+ dec_rounds2(16);
+ dec_fls2(16);
+ dec_rounds2(8);
+ dec_fls2(8);
+ dec_rounds2(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack2();
+
+ movq RR12, %r12;
+ movq RXOR, %rbx;
+ ret;
+ENDPROC(camellia_dec_blk_2way)
diff --git a/marvell/linux/arch/x86/crypto/camellia_aesni_avx2_glue.c b/marvell/linux/arch/x86/crypto/camellia_aesni_avx2_glue.c
new file mode 100644
index 0000000..a8cc2c8
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_enc_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_enc_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_enc_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_enc_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = camellia_ctr_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = camellia_ctr_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = camellia_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_enc_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_enc_16way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = camellia_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_dec_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_dec_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_dec_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = camellia_cbc_dec_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = camellia_cbc_dec_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = camellia_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_dec_32way }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_dec_16way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = camellia_xts_dec }
+ } }
+};
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+ &tfm->base.crt_flags);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "__ecb(camellia)",
+ .base.cra_driver_name = "__ecb-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(camellia)",
+ .base.cra_driver_name = "__cbc-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(camellia)",
+ .base.cra_driver_name = "__ctr-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(camellia)",
+ .base.cra_driver_name = "__xts-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = xts_camellia_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(camellia_algs,
+ ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/marvell/linux/arch/x86/crypto/camellia_aesni_avx_glue.c b/marvell/linux/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 0000000..31a82a7
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
+
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
+asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_ctr_16way);
+
+asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
+
+asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
+
+void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk);
+}
+EXPORT_SYMBOL_GPL(camellia_xts_enc);
+
+void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk);
+}
+EXPORT_SYMBOL_GPL(camellia_xts_dec);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_enc_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_enc_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_enc_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = camellia_ctr_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = camellia_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_enc_16way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = camellia_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = camellia_ecb_dec_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_dec_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = camellia_cbc_dec_16way }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = camellia_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = camellia_xts_dec_16way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = camellia_xts_dec }
+ } }
+};
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+ &tfm->base.crt_flags);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+EXPORT_SYMBOL_GPL(xts_camellia_setkey);
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "__ecb(camellia)",
+ .base.cra_driver_name = "__ecb-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(camellia)",
+ .base.cra_driver_name = "__cbc-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(camellia)",
+ .base.cra_driver_name = "__ctr-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(camellia)",
+ .base.cra_driver_name = "__xts-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = xts_camellia_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(camellia_algs,
+ ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/marvell/linux/arch/x86/crypto/camellia_glue.c b/marvell/linux/arch/x86/crypto/camellia_glue.c
new file mode 100644
index 0000000..5f3ed5a
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/camellia_glue.c
@@ -0,0 +1,1515 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for assembler optimized version of Camellia
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Camellia parts based on code by:
+ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
+ */
+
+#include <asm/unaligned.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+/* camellia sboxes */
+__visible const u64 camellia_sp10011110[256] = {
+ 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
+ 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
+ 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
+ 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL,
+ 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL,
+ 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL,
+ 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL,
+ 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL,
+ 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL,
+ 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL,
+ 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL,
+ 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL,
+ 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL,
+ 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL,
+ 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL,
+ 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL,
+ 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL,
+ 0xca0000cacacaca00ULL, 0xd50000d5d5d5d500ULL, 0x4700004747474700ULL,
+ 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL,
+ 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL,
+ 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL,
+ 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL,
+ 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL,
+ 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL,
+ 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL,
+ 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL,
+ 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL,
+ 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL,
+ 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL,
+ 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL,
+ 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL,
+ 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL,
+ 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL,
+ 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL,
+ 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL,
+ 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL,
+ 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL,
+ 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL,
+ 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL,
+ 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL,
+ 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL,
+ 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL,
+ 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL,
+ 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL,
+ 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL,
+ 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL,
+ 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL,
+ 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL,
+ 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL,
+ 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL,
+ 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL,
+ 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL,
+ 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL,
+ 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL,
+ 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL,
+ 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL,
+ 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL,
+ 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL,
+ 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL,
+ 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL,
+ 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL,
+ 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL,
+ 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL,
+ 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL,
+ 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL,
+ 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL,
+ 0xf50000f5f5f5f500ULL, 0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL,
+ 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL,
+ 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL,
+ 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL,
+ 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL,
+ 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL,
+ 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL,
+ 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL,
+ 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL,
+ 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL,
+ 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL,
+ 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL,
+ 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL,
+ 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL,
+ 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL,
+ 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL,
+ 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL,
+ 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL,
+ 0x9e00009e9e9e9e00ULL,
+};
+
+__visible const u64 camellia_sp22000222[256] = {
+ 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
+ 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
+ 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
+ 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL,
+ 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL,
+ 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL,
+ 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL,
+ 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL,
+ 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL,
+ 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL,
+ 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL,
+ 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL,
+ 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL,
+ 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL,
+ 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL,
+ 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL,
+ 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL,
+ 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL,
+ 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL,
+ 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL,
+ 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL,
+ 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL,
+ 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL,
+ 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL,
+ 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL,
+ 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL,
+ 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL,
+ 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL,
+ 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL,
+ 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL,
+ 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL,
+ 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL,
+ 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL,
+ 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL,
+ 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL,
+ 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL,
+ 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL,
+ 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL,
+ 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL,
+ 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL,
+ 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL,
+ 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL,
+ 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL,
+ 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL,
+ 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL,
+ 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL,
+ 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL,
+ 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL,
+ 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL,
+ 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL,
+ 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL,
+ 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL,
+ 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL,
+ 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL,
+ 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL,
+ 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL,
+ 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL,
+ 0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL,
+ 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL,
+ 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL,
+ 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL,
+ 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL,
+ 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL,
+ 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL,
+ 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL,
+ 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL,
+ 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL,
+ 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL,
+ 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL,
+ 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL,
+ 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL,
+ 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL,
+ 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL,
+ 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL,
+ 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL,
+ 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL,
+ 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL,
+ 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL,
+ 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL,
+ 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL,
+ 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL,
+ 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL,
+ 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL,
+ 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL,
+ 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL,
+ 0x3d3d0000003d3d3dULL,
+};
+
+__visible const u64 camellia_sp03303033[256] = {
+ 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
+ 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
+ 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
+ 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL,
+ 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL,
+ 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL,
+ 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL,
+ 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL,
+ 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL,
+ 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL,
+ 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL,
+ 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL,
+ 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL,
+ 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL,
+ 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL,
+ 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL,
+ 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL,
+ 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL,
+ 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL,
+ 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL,
+ 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL,
+ 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL,
+ 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL,
+ 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL,
+ 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL,
+ 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL,
+ 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL,
+ 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL,
+ 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL,
+ 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL,
+ 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL,
+ 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL,
+ 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL,
+ 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL,
+ 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL,
+ 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL,
+ 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL,
+ 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL,
+ 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL,
+ 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL,
+ 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL,
+ 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL,
+ 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL,
+ 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL,
+ 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL,
+ 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL,
+ 0x000f0f000f000f0fULL, 0x00caca00ca00cacaULL, 0x0070700070007070ULL,
+ 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL,
+ 0x0008080008000808ULL, 0x0062620062006262ULL, 0x0000000000000000ULL,
+ 0x0024240024002424ULL, 0x00d1d100d100d1d1ULL, 0x00fbfb00fb00fbfbULL,
+ 0x00baba00ba00babaULL, 0x00eded00ed00ededULL, 0x0045450045004545ULL,
+ 0x0081810081008181ULL, 0x0073730073007373ULL, 0x006d6d006d006d6dULL,
+ 0x0084840084008484ULL, 0x009f9f009f009f9fULL, 0x00eeee00ee00eeeeULL,
+ 0x004a4a004a004a4aULL, 0x00c3c300c300c3c3ULL, 0x002e2e002e002e2eULL,
+ 0x00c1c100c100c1c1ULL, 0x0001010001000101ULL, 0x00e6e600e600e6e6ULL,
+ 0x0025250025002525ULL, 0x0048480048004848ULL, 0x0099990099009999ULL,
+ 0x00b9b900b900b9b9ULL, 0x00b3b300b300b3b3ULL, 0x007b7b007b007b7bULL,
+ 0x00f9f900f900f9f9ULL, 0x00cece00ce00ceceULL, 0x00bfbf00bf00bfbfULL,
+ 0x00dfdf00df00dfdfULL, 0x0071710071007171ULL, 0x0029290029002929ULL,
+ 0x00cdcd00cd00cdcdULL, 0x006c6c006c006c6cULL, 0x0013130013001313ULL,
+ 0x0064640064006464ULL, 0x009b9b009b009b9bULL, 0x0063630063006363ULL,
+ 0x009d9d009d009d9dULL, 0x00c0c000c000c0c0ULL, 0x004b4b004b004b4bULL,
+ 0x00b7b700b700b7b7ULL, 0x00a5a500a500a5a5ULL, 0x0089890089008989ULL,
+ 0x005f5f005f005f5fULL, 0x00b1b100b100b1b1ULL, 0x0017170017001717ULL,
+ 0x00f4f400f400f4f4ULL, 0x00bcbc00bc00bcbcULL, 0x00d3d300d300d3d3ULL,
+ 0x0046460046004646ULL, 0x00cfcf00cf00cfcfULL, 0x0037370037003737ULL,
+ 0x005e5e005e005e5eULL, 0x0047470047004747ULL, 0x0094940094009494ULL,
+ 0x00fafa00fa00fafaULL, 0x00fcfc00fc00fcfcULL, 0x005b5b005b005b5bULL,
+ 0x0097970097009797ULL, 0x00fefe00fe00fefeULL, 0x005a5a005a005a5aULL,
+ 0x00acac00ac00acacULL, 0x003c3c003c003c3cULL, 0x004c4c004c004c4cULL,
+ 0x0003030003000303ULL, 0x0035350035003535ULL, 0x00f3f300f300f3f3ULL,
+ 0x0023230023002323ULL, 0x00b8b800b800b8b8ULL, 0x005d5d005d005d5dULL,
+ 0x006a6a006a006a6aULL, 0x0092920092009292ULL, 0x00d5d500d500d5d5ULL,
+ 0x0021210021002121ULL, 0x0044440044004444ULL, 0x0051510051005151ULL,
+ 0x00c6c600c600c6c6ULL, 0x007d7d007d007d7dULL, 0x0039390039003939ULL,
+ 0x0083830083008383ULL, 0x00dcdc00dc00dcdcULL, 0x00aaaa00aa00aaaaULL,
+ 0x007c7c007c007c7cULL, 0x0077770077007777ULL, 0x0056560056005656ULL,
+ 0x0005050005000505ULL, 0x001b1b001b001b1bULL, 0x00a4a400a400a4a4ULL,
+ 0x0015150015001515ULL, 0x0034340034003434ULL, 0x001e1e001e001e1eULL,
+ 0x001c1c001c001c1cULL, 0x00f8f800f800f8f8ULL, 0x0052520052005252ULL,
+ 0x0020200020002020ULL, 0x0014140014001414ULL, 0x00e9e900e900e9e9ULL,
+ 0x00bdbd00bd00bdbdULL, 0x00dddd00dd00ddddULL, 0x00e4e400e400e4e4ULL,
+ 0x00a1a100a100a1a1ULL, 0x00e0e000e000e0e0ULL, 0x008a8a008a008a8aULL,
+ 0x00f1f100f100f1f1ULL, 0x00d6d600d600d6d6ULL, 0x007a7a007a007a7aULL,
+ 0x00bbbb00bb00bbbbULL, 0x00e3e300e300e3e3ULL, 0x0040400040004040ULL,
+ 0x004f4f004f004f4fULL,
+};
+
+__visible const u64 camellia_sp00444404[256] = {
+ 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
+ 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
+ 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
+ 0x00006b6b6b6b006bULL, 0x0000454545450045ULL, 0x0000a5a5a5a500a5ULL,
+ 0x0000edededed00edULL, 0x00004f4f4f4f004fULL, 0x00001d1d1d1d001dULL,
+ 0x0000929292920092ULL, 0x0000868686860086ULL, 0x0000afafafaf00afULL,
+ 0x00007c7c7c7c007cULL, 0x00001f1f1f1f001fULL, 0x00003e3e3e3e003eULL,
+ 0x0000dcdcdcdc00dcULL, 0x00005e5e5e5e005eULL, 0x00000b0b0b0b000bULL,
+ 0x0000a6a6a6a600a6ULL, 0x0000393939390039ULL, 0x0000d5d5d5d500d5ULL,
+ 0x00005d5d5d5d005dULL, 0x0000d9d9d9d900d9ULL, 0x00005a5a5a5a005aULL,
+ 0x0000515151510051ULL, 0x00006c6c6c6c006cULL, 0x00008b8b8b8b008bULL,
+ 0x00009a9a9a9a009aULL, 0x0000fbfbfbfb00fbULL, 0x0000b0b0b0b000b0ULL,
+ 0x0000747474740074ULL, 0x00002b2b2b2b002bULL, 0x0000f0f0f0f000f0ULL,
+ 0x0000848484840084ULL, 0x0000dfdfdfdf00dfULL, 0x0000cbcbcbcb00cbULL,
+ 0x0000343434340034ULL, 0x0000767676760076ULL, 0x00006d6d6d6d006dULL,
+ 0x0000a9a9a9a900a9ULL, 0x0000d1d1d1d100d1ULL, 0x0000040404040004ULL,
+ 0x0000141414140014ULL, 0x00003a3a3a3a003aULL, 0x0000dededede00deULL,
+ 0x0000111111110011ULL, 0x0000323232320032ULL, 0x00009c9c9c9c009cULL,
+ 0x0000535353530053ULL, 0x0000f2f2f2f200f2ULL, 0x0000fefefefe00feULL,
+ 0x0000cfcfcfcf00cfULL, 0x0000c3c3c3c300c3ULL, 0x00007a7a7a7a007aULL,
+ 0x0000242424240024ULL, 0x0000e8e8e8e800e8ULL, 0x0000606060600060ULL,
+ 0x0000696969690069ULL, 0x0000aaaaaaaa00aaULL, 0x0000a0a0a0a000a0ULL,
+ 0x0000a1a1a1a100a1ULL, 0x0000626262620062ULL, 0x0000545454540054ULL,
+ 0x00001e1e1e1e001eULL, 0x0000e0e0e0e000e0ULL, 0x0000646464640064ULL,
+ 0x0000101010100010ULL, 0x0000000000000000ULL, 0x0000a3a3a3a300a3ULL,
+ 0x0000757575750075ULL, 0x00008a8a8a8a008aULL, 0x0000e6e6e6e600e6ULL,
+ 0x0000090909090009ULL, 0x0000dddddddd00ddULL, 0x0000878787870087ULL,
+ 0x0000838383830083ULL, 0x0000cdcdcdcd00cdULL, 0x0000909090900090ULL,
+ 0x0000737373730073ULL, 0x0000f6f6f6f600f6ULL, 0x00009d9d9d9d009dULL,
+ 0x0000bfbfbfbf00bfULL, 0x0000525252520052ULL, 0x0000d8d8d8d800d8ULL,
+ 0x0000c8c8c8c800c8ULL, 0x0000c6c6c6c600c6ULL, 0x0000818181810081ULL,
+ 0x00006f6f6f6f006fULL, 0x0000131313130013ULL, 0x0000636363630063ULL,
+ 0x0000e9e9e9e900e9ULL, 0x0000a7a7a7a700a7ULL, 0x00009f9f9f9f009fULL,
+ 0x0000bcbcbcbc00bcULL, 0x0000292929290029ULL, 0x0000f9f9f9f900f9ULL,
+ 0x00002f2f2f2f002fULL, 0x0000b4b4b4b400b4ULL, 0x0000787878780078ULL,
+ 0x0000060606060006ULL, 0x0000e7e7e7e700e7ULL, 0x0000717171710071ULL,
+ 0x0000d4d4d4d400d4ULL, 0x0000abababab00abULL, 0x0000888888880088ULL,
+ 0x00008d8d8d8d008dULL, 0x0000727272720072ULL, 0x0000b9b9b9b900b9ULL,
+ 0x0000f8f8f8f800f8ULL, 0x0000acacacac00acULL, 0x0000363636360036ULL,
+ 0x00002a2a2a2a002aULL, 0x00003c3c3c3c003cULL, 0x0000f1f1f1f100f1ULL,
+ 0x0000404040400040ULL, 0x0000d3d3d3d300d3ULL, 0x0000bbbbbbbb00bbULL,
+ 0x0000434343430043ULL, 0x0000151515150015ULL, 0x0000adadadad00adULL,
+ 0x0000777777770077ULL, 0x0000808080800080ULL, 0x0000828282820082ULL,
+ 0x0000ecececec00ecULL, 0x0000272727270027ULL, 0x0000e5e5e5e500e5ULL,
+ 0x0000858585850085ULL, 0x0000353535350035ULL, 0x00000c0c0c0c000cULL,
+ 0x0000414141410041ULL, 0x0000efefefef00efULL, 0x0000939393930093ULL,
+ 0x0000191919190019ULL, 0x0000212121210021ULL, 0x00000e0e0e0e000eULL,
+ 0x00004e4e4e4e004eULL, 0x0000656565650065ULL, 0x0000bdbdbdbd00bdULL,
+ 0x0000b8b8b8b800b8ULL, 0x00008f8f8f8f008fULL, 0x0000ebebebeb00ebULL,
+ 0x0000cececece00ceULL, 0x0000303030300030ULL, 0x00005f5f5f5f005fULL,
+ 0x0000c5c5c5c500c5ULL, 0x00001a1a1a1a001aULL, 0x0000e1e1e1e100e1ULL,
+ 0x0000cacacaca00caULL, 0x0000474747470047ULL, 0x00003d3d3d3d003dULL,
+ 0x0000010101010001ULL, 0x0000d6d6d6d600d6ULL, 0x0000565656560056ULL,
+ 0x00004d4d4d4d004dULL, 0x00000d0d0d0d000dULL, 0x0000666666660066ULL,
+ 0x0000cccccccc00ccULL, 0x00002d2d2d2d002dULL, 0x0000121212120012ULL,
+ 0x0000202020200020ULL, 0x0000b1b1b1b100b1ULL, 0x0000999999990099ULL,
+ 0x00004c4c4c4c004cULL, 0x0000c2c2c2c200c2ULL, 0x00007e7e7e7e007eULL,
+ 0x0000050505050005ULL, 0x0000b7b7b7b700b7ULL, 0x0000313131310031ULL,
+ 0x0000171717170017ULL, 0x0000d7d7d7d700d7ULL, 0x0000585858580058ULL,
+ 0x0000616161610061ULL, 0x00001b1b1b1b001bULL, 0x00001c1c1c1c001cULL,
+ 0x00000f0f0f0f000fULL, 0x0000161616160016ULL, 0x0000181818180018ULL,
+ 0x0000222222220022ULL, 0x0000444444440044ULL, 0x0000b2b2b2b200b2ULL,
+ 0x0000b5b5b5b500b5ULL, 0x0000919191910091ULL, 0x0000080808080008ULL,
+ 0x0000a8a8a8a800a8ULL, 0x0000fcfcfcfc00fcULL, 0x0000505050500050ULL,
+ 0x0000d0d0d0d000d0ULL, 0x00007d7d7d7d007dULL, 0x0000898989890089ULL,
+ 0x0000979797970097ULL, 0x00005b5b5b5b005bULL, 0x0000959595950095ULL,
+ 0x0000ffffffff00ffULL, 0x0000d2d2d2d200d2ULL, 0x0000c4c4c4c400c4ULL,
+ 0x0000484848480048ULL, 0x0000f7f7f7f700f7ULL, 0x0000dbdbdbdb00dbULL,
+ 0x0000030303030003ULL, 0x0000dadadada00daULL, 0x00003f3f3f3f003fULL,
+ 0x0000949494940094ULL, 0x00005c5c5c5c005cULL, 0x0000020202020002ULL,
+ 0x00004a4a4a4a004aULL, 0x0000333333330033ULL, 0x0000676767670067ULL,
+ 0x0000f3f3f3f300f3ULL, 0x00007f7f7f7f007fULL, 0x0000e2e2e2e200e2ULL,
+ 0x00009b9b9b9b009bULL, 0x0000262626260026ULL, 0x0000373737370037ULL,
+ 0x00003b3b3b3b003bULL, 0x0000969696960096ULL, 0x00004b4b4b4b004bULL,
+ 0x0000bebebebe00beULL, 0x00002e2e2e2e002eULL, 0x0000797979790079ULL,
+ 0x00008c8c8c8c008cULL, 0x00006e6e6e6e006eULL, 0x00008e8e8e8e008eULL,
+ 0x0000f5f5f5f500f5ULL, 0x0000b6b6b6b600b6ULL, 0x0000fdfdfdfd00fdULL,
+ 0x0000595959590059ULL, 0x0000989898980098ULL, 0x00006a6a6a6a006aULL,
+ 0x0000464646460046ULL, 0x0000babababa00baULL, 0x0000252525250025ULL,
+ 0x0000424242420042ULL, 0x0000a2a2a2a200a2ULL, 0x0000fafafafa00faULL,
+ 0x0000070707070007ULL, 0x0000555555550055ULL, 0x0000eeeeeeee00eeULL,
+ 0x00000a0a0a0a000aULL, 0x0000494949490049ULL, 0x0000686868680068ULL,
+ 0x0000383838380038ULL, 0x0000a4a4a4a400a4ULL, 0x0000282828280028ULL,
+ 0x00007b7b7b7b007bULL, 0x0000c9c9c9c900c9ULL, 0x0000c1c1c1c100c1ULL,
+ 0x0000e3e3e3e300e3ULL, 0x0000f4f4f4f400f4ULL, 0x0000c7c7c7c700c7ULL,
+ 0x00009e9e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp02220222[256] = {
+ 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
+ 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
+ 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
+ 0x000b0b0b000b0b0bULL, 0x00aeaeae00aeaeaeULL, 0x006a6a6a006a6a6aULL,
+ 0x00d5d5d500d5d5d5ULL, 0x0018181800181818ULL, 0x005d5d5d005d5d5dULL,
+ 0x0082828200828282ULL, 0x0046464600464646ULL, 0x00dfdfdf00dfdfdfULL,
+ 0x00d6d6d600d6d6d6ULL, 0x0027272700272727ULL, 0x008a8a8a008a8a8aULL,
+ 0x0032323200323232ULL, 0x004b4b4b004b4b4bULL, 0x0042424200424242ULL,
+ 0x00dbdbdb00dbdbdbULL, 0x001c1c1c001c1c1cULL, 0x009e9e9e009e9e9eULL,
+ 0x009c9c9c009c9c9cULL, 0x003a3a3a003a3a3aULL, 0x00cacaca00cacacaULL,
+ 0x0025252500252525ULL, 0x007b7b7b007b7b7bULL, 0x000d0d0d000d0d0dULL,
+ 0x0071717100717171ULL, 0x005f5f5f005f5f5fULL, 0x001f1f1f001f1f1fULL,
+ 0x00f8f8f800f8f8f8ULL, 0x00d7d7d700d7d7d7ULL, 0x003e3e3e003e3e3eULL,
+ 0x009d9d9d009d9d9dULL, 0x007c7c7c007c7c7cULL, 0x0060606000606060ULL,
+ 0x00b9b9b900b9b9b9ULL, 0x00bebebe00bebebeULL, 0x00bcbcbc00bcbcbcULL,
+ 0x008b8b8b008b8b8bULL, 0x0016161600161616ULL, 0x0034343400343434ULL,
+ 0x004d4d4d004d4d4dULL, 0x00c3c3c300c3c3c3ULL, 0x0072727200727272ULL,
+ 0x0095959500959595ULL, 0x00ababab00abababULL, 0x008e8e8e008e8e8eULL,
+ 0x00bababa00bababaULL, 0x007a7a7a007a7a7aULL, 0x00b3b3b300b3b3b3ULL,
+ 0x0002020200020202ULL, 0x00b4b4b400b4b4b4ULL, 0x00adadad00adadadULL,
+ 0x00a2a2a200a2a2a2ULL, 0x00acacac00acacacULL, 0x00d8d8d800d8d8d8ULL,
+ 0x009a9a9a009a9a9aULL, 0x0017171700171717ULL, 0x001a1a1a001a1a1aULL,
+ 0x0035353500353535ULL, 0x00cccccc00ccccccULL, 0x00f7f7f700f7f7f7ULL,
+ 0x0099999900999999ULL, 0x0061616100616161ULL, 0x005a5a5a005a5a5aULL,
+ 0x00e8e8e800e8e8e8ULL, 0x0024242400242424ULL, 0x0056565600565656ULL,
+ 0x0040404000404040ULL, 0x00e1e1e100e1e1e1ULL, 0x0063636300636363ULL,
+ 0x0009090900090909ULL, 0x0033333300333333ULL, 0x00bfbfbf00bfbfbfULL,
+ 0x0098989800989898ULL, 0x0097979700979797ULL, 0x0085858500858585ULL,
+ 0x0068686800686868ULL, 0x00fcfcfc00fcfcfcULL, 0x00ececec00ecececULL,
+ 0x000a0a0a000a0a0aULL, 0x00dadada00dadadaULL, 0x006f6f6f006f6f6fULL,
+ 0x0053535300535353ULL, 0x0062626200626262ULL, 0x00a3a3a300a3a3a3ULL,
+ 0x002e2e2e002e2e2eULL, 0x0008080800080808ULL, 0x00afafaf00afafafULL,
+ 0x0028282800282828ULL, 0x00b0b0b000b0b0b0ULL, 0x0074747400747474ULL,
+ 0x00c2c2c200c2c2c2ULL, 0x00bdbdbd00bdbdbdULL, 0x0036363600363636ULL,
+ 0x0022222200222222ULL, 0x0038383800383838ULL, 0x0064646400646464ULL,
+ 0x001e1e1e001e1e1eULL, 0x0039393900393939ULL, 0x002c2c2c002c2c2cULL,
+ 0x00a6a6a600a6a6a6ULL, 0x0030303000303030ULL, 0x00e5e5e500e5e5e5ULL,
+ 0x0044444400444444ULL, 0x00fdfdfd00fdfdfdULL, 0x0088888800888888ULL,
+ 0x009f9f9f009f9f9fULL, 0x0065656500656565ULL, 0x0087878700878787ULL,
+ 0x006b6b6b006b6b6bULL, 0x00f4f4f400f4f4f4ULL, 0x0023232300232323ULL,
+ 0x0048484800484848ULL, 0x0010101000101010ULL, 0x00d1d1d100d1d1d1ULL,
+ 0x0051515100515151ULL, 0x00c0c0c000c0c0c0ULL, 0x00f9f9f900f9f9f9ULL,
+ 0x00d2d2d200d2d2d2ULL, 0x00a0a0a000a0a0a0ULL, 0x0055555500555555ULL,
+ 0x00a1a1a100a1a1a1ULL, 0x0041414100414141ULL, 0x00fafafa00fafafaULL,
+ 0x0043434300434343ULL, 0x0013131300131313ULL, 0x00c4c4c400c4c4c4ULL,
+ 0x002f2f2f002f2f2fULL, 0x00a8a8a800a8a8a8ULL, 0x00b6b6b600b6b6b6ULL,
+ 0x003c3c3c003c3c3cULL, 0x002b2b2b002b2b2bULL, 0x00c1c1c100c1c1c1ULL,
+ 0x00ffffff00ffffffULL, 0x00c8c8c800c8c8c8ULL, 0x00a5a5a500a5a5a5ULL,
+ 0x0020202000202020ULL, 0x0089898900898989ULL, 0x0000000000000000ULL,
+ 0x0090909000909090ULL, 0x0047474700474747ULL, 0x00efefef00efefefULL,
+ 0x00eaeaea00eaeaeaULL, 0x00b7b7b700b7b7b7ULL, 0x0015151500151515ULL,
+ 0x0006060600060606ULL, 0x00cdcdcd00cdcdcdULL, 0x00b5b5b500b5b5b5ULL,
+ 0x0012121200121212ULL, 0x007e7e7e007e7e7eULL, 0x00bbbbbb00bbbbbbULL,
+ 0x0029292900292929ULL, 0x000f0f0f000f0f0fULL, 0x00b8b8b800b8b8b8ULL,
+ 0x0007070700070707ULL, 0x0004040400040404ULL, 0x009b9b9b009b9b9bULL,
+ 0x0094949400949494ULL, 0x0021212100212121ULL, 0x0066666600666666ULL,
+ 0x00e6e6e600e6e6e6ULL, 0x00cecece00cececeULL, 0x00ededed00edededULL,
+ 0x00e7e7e700e7e7e7ULL, 0x003b3b3b003b3b3bULL, 0x00fefefe00fefefeULL,
+ 0x007f7f7f007f7f7fULL, 0x00c5c5c500c5c5c5ULL, 0x00a4a4a400a4a4a4ULL,
+ 0x0037373700373737ULL, 0x00b1b1b100b1b1b1ULL, 0x004c4c4c004c4c4cULL,
+ 0x0091919100919191ULL, 0x006e6e6e006e6e6eULL, 0x008d8d8d008d8d8dULL,
+ 0x0076767600767676ULL, 0x0003030300030303ULL, 0x002d2d2d002d2d2dULL,
+ 0x00dedede00dededeULL, 0x0096969600969696ULL, 0x0026262600262626ULL,
+ 0x007d7d7d007d7d7dULL, 0x00c6c6c600c6c6c6ULL, 0x005c5c5c005c5c5cULL,
+ 0x00d3d3d300d3d3d3ULL, 0x00f2f2f200f2f2f2ULL, 0x004f4f4f004f4f4fULL,
+ 0x0019191900191919ULL, 0x003f3f3f003f3f3fULL, 0x00dcdcdc00dcdcdcULL,
+ 0x0079797900797979ULL, 0x001d1d1d001d1d1dULL, 0x0052525200525252ULL,
+ 0x00ebebeb00ebebebULL, 0x00f3f3f300f3f3f3ULL, 0x006d6d6d006d6d6dULL,
+ 0x005e5e5e005e5e5eULL, 0x00fbfbfb00fbfbfbULL, 0x0069696900696969ULL,
+ 0x00b2b2b200b2b2b2ULL, 0x00f0f0f000f0f0f0ULL, 0x0031313100313131ULL,
+ 0x000c0c0c000c0c0cULL, 0x00d4d4d400d4d4d4ULL, 0x00cfcfcf00cfcfcfULL,
+ 0x008c8c8c008c8c8cULL, 0x00e2e2e200e2e2e2ULL, 0x0075757500757575ULL,
+ 0x00a9a9a900a9a9a9ULL, 0x004a4a4a004a4a4aULL, 0x0057575700575757ULL,
+ 0x0084848400848484ULL, 0x0011111100111111ULL, 0x0045454500454545ULL,
+ 0x001b1b1b001b1b1bULL, 0x00f5f5f500f5f5f5ULL, 0x00e4e4e400e4e4e4ULL,
+ 0x000e0e0e000e0e0eULL, 0x0073737300737373ULL, 0x00aaaaaa00aaaaaaULL,
+ 0x00f1f1f100f1f1f1ULL, 0x00dddddd00ddddddULL, 0x0059595900595959ULL,
+ 0x0014141400141414ULL, 0x006c6c6c006c6c6cULL, 0x0092929200929292ULL,
+ 0x0054545400545454ULL, 0x00d0d0d000d0d0d0ULL, 0x0078787800787878ULL,
+ 0x0070707000707070ULL, 0x00e3e3e300e3e3e3ULL, 0x0049494900494949ULL,
+ 0x0080808000808080ULL, 0x0050505000505050ULL, 0x00a7a7a700a7a7a7ULL,
+ 0x00f6f6f600f6f6f6ULL, 0x0077777700777777ULL, 0x0093939300939393ULL,
+ 0x0086868600868686ULL, 0x0083838300838383ULL, 0x002a2a2a002a2a2aULL,
+ 0x00c7c7c700c7c7c7ULL, 0x005b5b5b005b5b5bULL, 0x00e9e9e900e9e9e9ULL,
+ 0x00eeeeee00eeeeeeULL, 0x008f8f8f008f8f8fULL, 0x0001010100010101ULL,
+ 0x003d3d3d003d3d3dULL,
+};
+
+__visible const u64 camellia_sp30333033[256] = {
+ 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
+ 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
+ 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
+ 0xc200c2c2c200c2c2ULL, 0xab00ababab00ababULL, 0x9a009a9a9a009a9aULL,
+ 0x7500757575007575ULL, 0x0600060606000606ULL, 0x5700575757005757ULL,
+ 0xa000a0a0a000a0a0ULL, 0x9100919191009191ULL, 0xf700f7f7f700f7f7ULL,
+ 0xb500b5b5b500b5b5ULL, 0xc900c9c9c900c9c9ULL, 0xa200a2a2a200a2a2ULL,
+ 0x8c008c8c8c008c8cULL, 0xd200d2d2d200d2d2ULL, 0x9000909090009090ULL,
+ 0xf600f6f6f600f6f6ULL, 0x0700070707000707ULL, 0xa700a7a7a700a7a7ULL,
+ 0x2700272727002727ULL, 0x8e008e8e8e008e8eULL, 0xb200b2b2b200b2b2ULL,
+ 0x4900494949004949ULL, 0xde00dedede00dedeULL, 0x4300434343004343ULL,
+ 0x5c005c5c5c005c5cULL, 0xd700d7d7d700d7d7ULL, 0xc700c7c7c700c7c7ULL,
+ 0x3e003e3e3e003e3eULL, 0xf500f5f5f500f5f5ULL, 0x8f008f8f8f008f8fULL,
+ 0x6700676767006767ULL, 0x1f001f1f1f001f1fULL, 0x1800181818001818ULL,
+ 0x6e006e6e6e006e6eULL, 0xaf00afafaf00afafULL, 0x2f002f2f2f002f2fULL,
+ 0xe200e2e2e200e2e2ULL, 0x8500858585008585ULL, 0x0d000d0d0d000d0dULL,
+ 0x5300535353005353ULL, 0xf000f0f0f000f0f0ULL, 0x9c009c9c9c009c9cULL,
+ 0x6500656565006565ULL, 0xea00eaeaea00eaeaULL, 0xa300a3a3a300a3a3ULL,
+ 0xae00aeaeae00aeaeULL, 0x9e009e9e9e009e9eULL, 0xec00ececec00ececULL,
+ 0x8000808080008080ULL, 0x2d002d2d2d002d2dULL, 0x6b006b6b6b006b6bULL,
+ 0xa800a8a8a800a8a8ULL, 0x2b002b2b2b002b2bULL, 0x3600363636003636ULL,
+ 0xa600a6a6a600a6a6ULL, 0xc500c5c5c500c5c5ULL, 0x8600868686008686ULL,
+ 0x4d004d4d4d004d4dULL, 0x3300333333003333ULL, 0xfd00fdfdfd00fdfdULL,
+ 0x6600666666006666ULL, 0x5800585858005858ULL, 0x9600969696009696ULL,
+ 0x3a003a3a3a003a3aULL, 0x0900090909000909ULL, 0x9500959595009595ULL,
+ 0x1000101010001010ULL, 0x7800787878007878ULL, 0xd800d8d8d800d8d8ULL,
+ 0x4200424242004242ULL, 0xcc00cccccc00ccccULL, 0xef00efefef00efefULL,
+ 0x2600262626002626ULL, 0xe500e5e5e500e5e5ULL, 0x6100616161006161ULL,
+ 0x1a001a1a1a001a1aULL, 0x3f003f3f3f003f3fULL, 0x3b003b3b3b003b3bULL,
+ 0x8200828282008282ULL, 0xb600b6b6b600b6b6ULL, 0xdb00dbdbdb00dbdbULL,
+ 0xd400d4d4d400d4d4ULL, 0x9800989898009898ULL, 0xe800e8e8e800e8e8ULL,
+ 0x8b008b8b8b008b8bULL, 0x0200020202000202ULL, 0xeb00ebebeb00ebebULL,
+ 0x0a000a0a0a000a0aULL, 0x2c002c2c2c002c2cULL, 0x1d001d1d1d001d1dULL,
+ 0xb000b0b0b000b0b0ULL, 0x6f006f6f6f006f6fULL, 0x8d008d8d8d008d8dULL,
+ 0x8800888888008888ULL, 0x0e000e0e0e000e0eULL, 0x1900191919001919ULL,
+ 0x8700878787008787ULL, 0x4e004e4e4e004e4eULL, 0x0b000b0b0b000b0bULL,
+ 0xa900a9a9a900a9a9ULL, 0x0c000c0c0c000c0cULL, 0x7900797979007979ULL,
+ 0x1100111111001111ULL, 0x7f007f7f7f007f7fULL, 0x2200222222002222ULL,
+ 0xe700e7e7e700e7e7ULL, 0x5900595959005959ULL, 0xe100e1e1e100e1e1ULL,
+ 0xda00dadada00dadaULL, 0x3d003d3d3d003d3dULL, 0xc800c8c8c800c8c8ULL,
+ 0x1200121212001212ULL, 0x0400040404000404ULL, 0x7400747474007474ULL,
+ 0x5400545454005454ULL, 0x3000303030003030ULL, 0x7e007e7e7e007e7eULL,
+ 0xb400b4b4b400b4b4ULL, 0x2800282828002828ULL, 0x5500555555005555ULL,
+ 0x6800686868006868ULL, 0x5000505050005050ULL, 0xbe00bebebe00bebeULL,
+ 0xd000d0d0d000d0d0ULL, 0xc400c4c4c400c4c4ULL, 0x3100313131003131ULL,
+ 0xcb00cbcbcb00cbcbULL, 0x2a002a2a2a002a2aULL, 0xad00adadad00adadULL,
+ 0x0f000f0f0f000f0fULL, 0xca00cacaca00cacaULL, 0x7000707070007070ULL,
+ 0xff00ffffff00ffffULL, 0x3200323232003232ULL, 0x6900696969006969ULL,
+ 0x0800080808000808ULL, 0x6200626262006262ULL, 0x0000000000000000ULL,
+ 0x2400242424002424ULL, 0xd100d1d1d100d1d1ULL, 0xfb00fbfbfb00fbfbULL,
+ 0xba00bababa00babaULL, 0xed00ededed00ededULL, 0x4500454545004545ULL,
+ 0x8100818181008181ULL, 0x7300737373007373ULL, 0x6d006d6d6d006d6dULL,
+ 0x8400848484008484ULL, 0x9f009f9f9f009f9fULL, 0xee00eeeeee00eeeeULL,
+ 0x4a004a4a4a004a4aULL, 0xc300c3c3c300c3c3ULL, 0x2e002e2e2e002e2eULL,
+ 0xc100c1c1c100c1c1ULL, 0x0100010101000101ULL, 0xe600e6e6e600e6e6ULL,
+ 0x2500252525002525ULL, 0x4800484848004848ULL, 0x9900999999009999ULL,
+ 0xb900b9b9b900b9b9ULL, 0xb300b3b3b300b3b3ULL, 0x7b007b7b7b007b7bULL,
+ 0xf900f9f9f900f9f9ULL, 0xce00cecece00ceceULL, 0xbf00bfbfbf00bfbfULL,
+ 0xdf00dfdfdf00dfdfULL, 0x7100717171007171ULL, 0x2900292929002929ULL,
+ 0xcd00cdcdcd00cdcdULL, 0x6c006c6c6c006c6cULL, 0x1300131313001313ULL,
+ 0x6400646464006464ULL, 0x9b009b9b9b009b9bULL, 0x6300636363006363ULL,
+ 0x9d009d9d9d009d9dULL, 0xc000c0c0c000c0c0ULL, 0x4b004b4b4b004b4bULL,
+ 0xb700b7b7b700b7b7ULL, 0xa500a5a5a500a5a5ULL, 0x8900898989008989ULL,
+ 0x5f005f5f5f005f5fULL, 0xb100b1b1b100b1b1ULL, 0x1700171717001717ULL,
+ 0xf400f4f4f400f4f4ULL, 0xbc00bcbcbc00bcbcULL, 0xd300d3d3d300d3d3ULL,
+ 0x4600464646004646ULL, 0xcf00cfcfcf00cfcfULL, 0x3700373737003737ULL,
+ 0x5e005e5e5e005e5eULL, 0x4700474747004747ULL, 0x9400949494009494ULL,
+ 0xfa00fafafa00fafaULL, 0xfc00fcfcfc00fcfcULL, 0x5b005b5b5b005b5bULL,
+ 0x9700979797009797ULL, 0xfe00fefefe00fefeULL, 0x5a005a5a5a005a5aULL,
+ 0xac00acacac00acacULL, 0x3c003c3c3c003c3cULL, 0x4c004c4c4c004c4cULL,
+ 0x0300030303000303ULL, 0x3500353535003535ULL, 0xf300f3f3f300f3f3ULL,
+ 0x2300232323002323ULL, 0xb800b8b8b800b8b8ULL, 0x5d005d5d5d005d5dULL,
+ 0x6a006a6a6a006a6aULL, 0x9200929292009292ULL, 0xd500d5d5d500d5d5ULL,
+ 0x2100212121002121ULL, 0x4400444444004444ULL, 0x5100515151005151ULL,
+ 0xc600c6c6c600c6c6ULL, 0x7d007d7d7d007d7dULL, 0x3900393939003939ULL,
+ 0x8300838383008383ULL, 0xdc00dcdcdc00dcdcULL, 0xaa00aaaaaa00aaaaULL,
+ 0x7c007c7c7c007c7cULL, 0x7700777777007777ULL, 0x5600565656005656ULL,
+ 0x0500050505000505ULL, 0x1b001b1b1b001b1bULL, 0xa400a4a4a400a4a4ULL,
+ 0x1500151515001515ULL, 0x3400343434003434ULL, 0x1e001e1e1e001e1eULL,
+ 0x1c001c1c1c001c1cULL, 0xf800f8f8f800f8f8ULL, 0x5200525252005252ULL,
+ 0x2000202020002020ULL, 0x1400141414001414ULL, 0xe900e9e9e900e9e9ULL,
+ 0xbd00bdbdbd00bdbdULL, 0xdd00dddddd00ddddULL, 0xe400e4e4e400e4e4ULL,
+ 0xa100a1a1a100a1a1ULL, 0xe000e0e0e000e0e0ULL, 0x8a008a8a8a008a8aULL,
+ 0xf100f1f1f100f1f1ULL, 0xd600d6d6d600d6d6ULL, 0x7a007a7a7a007a7aULL,
+ 0xbb00bbbbbb00bbbbULL, 0xe300e3e3e300e3e3ULL, 0x4000404040004040ULL,
+ 0x4f004f4f4f004f4fULL,
+};
+
+__visible const u64 camellia_sp44044404[256] = {
+ 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
+ 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
+ 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
+ 0x6b6b006b6b6b006bULL, 0x4545004545450045ULL, 0xa5a500a5a5a500a5ULL,
+ 0xeded00ededed00edULL, 0x4f4f004f4f4f004fULL, 0x1d1d001d1d1d001dULL,
+ 0x9292009292920092ULL, 0x8686008686860086ULL, 0xafaf00afafaf00afULL,
+ 0x7c7c007c7c7c007cULL, 0x1f1f001f1f1f001fULL, 0x3e3e003e3e3e003eULL,
+ 0xdcdc00dcdcdc00dcULL, 0x5e5e005e5e5e005eULL, 0x0b0b000b0b0b000bULL,
+ 0xa6a600a6a6a600a6ULL, 0x3939003939390039ULL, 0xd5d500d5d5d500d5ULL,
+ 0x5d5d005d5d5d005dULL, 0xd9d900d9d9d900d9ULL, 0x5a5a005a5a5a005aULL,
+ 0x5151005151510051ULL, 0x6c6c006c6c6c006cULL, 0x8b8b008b8b8b008bULL,
+ 0x9a9a009a9a9a009aULL, 0xfbfb00fbfbfb00fbULL, 0xb0b000b0b0b000b0ULL,
+ 0x7474007474740074ULL, 0x2b2b002b2b2b002bULL, 0xf0f000f0f0f000f0ULL,
+ 0x8484008484840084ULL, 0xdfdf00dfdfdf00dfULL, 0xcbcb00cbcbcb00cbULL,
+ 0x3434003434340034ULL, 0x7676007676760076ULL, 0x6d6d006d6d6d006dULL,
+ 0xa9a900a9a9a900a9ULL, 0xd1d100d1d1d100d1ULL, 0x0404000404040004ULL,
+ 0x1414001414140014ULL, 0x3a3a003a3a3a003aULL, 0xdede00dedede00deULL,
+ 0x1111001111110011ULL, 0x3232003232320032ULL, 0x9c9c009c9c9c009cULL,
+ 0x5353005353530053ULL, 0xf2f200f2f2f200f2ULL, 0xfefe00fefefe00feULL,
+ 0xcfcf00cfcfcf00cfULL, 0xc3c300c3c3c300c3ULL, 0x7a7a007a7a7a007aULL,
+ 0x2424002424240024ULL, 0xe8e800e8e8e800e8ULL, 0x6060006060600060ULL,
+ 0x6969006969690069ULL, 0xaaaa00aaaaaa00aaULL, 0xa0a000a0a0a000a0ULL,
+ 0xa1a100a1a1a100a1ULL, 0x6262006262620062ULL, 0x5454005454540054ULL,
+ 0x1e1e001e1e1e001eULL, 0xe0e000e0e0e000e0ULL, 0x6464006464640064ULL,
+ 0x1010001010100010ULL, 0x0000000000000000ULL, 0xa3a300a3a3a300a3ULL,
+ 0x7575007575750075ULL, 0x8a8a008a8a8a008aULL, 0xe6e600e6e6e600e6ULL,
+ 0x0909000909090009ULL, 0xdddd00dddddd00ddULL, 0x8787008787870087ULL,
+ 0x8383008383830083ULL, 0xcdcd00cdcdcd00cdULL, 0x9090009090900090ULL,
+ 0x7373007373730073ULL, 0xf6f600f6f6f600f6ULL, 0x9d9d009d9d9d009dULL,
+ 0xbfbf00bfbfbf00bfULL, 0x5252005252520052ULL, 0xd8d800d8d8d800d8ULL,
+ 0xc8c800c8c8c800c8ULL, 0xc6c600c6c6c600c6ULL, 0x8181008181810081ULL,
+ 0x6f6f006f6f6f006fULL, 0x1313001313130013ULL, 0x6363006363630063ULL,
+ 0xe9e900e9e9e900e9ULL, 0xa7a700a7a7a700a7ULL, 0x9f9f009f9f9f009fULL,
+ 0xbcbc00bcbcbc00bcULL, 0x2929002929290029ULL, 0xf9f900f9f9f900f9ULL,
+ 0x2f2f002f2f2f002fULL, 0xb4b400b4b4b400b4ULL, 0x7878007878780078ULL,
+ 0x0606000606060006ULL, 0xe7e700e7e7e700e7ULL, 0x7171007171710071ULL,
+ 0xd4d400d4d4d400d4ULL, 0xabab00ababab00abULL, 0x8888008888880088ULL,
+ 0x8d8d008d8d8d008dULL, 0x7272007272720072ULL, 0xb9b900b9b9b900b9ULL,
+ 0xf8f800f8f8f800f8ULL, 0xacac00acacac00acULL, 0x3636003636360036ULL,
+ 0x2a2a002a2a2a002aULL, 0x3c3c003c3c3c003cULL, 0xf1f100f1f1f100f1ULL,
+ 0x4040004040400040ULL, 0xd3d300d3d3d300d3ULL, 0xbbbb00bbbbbb00bbULL,
+ 0x4343004343430043ULL, 0x1515001515150015ULL, 0xadad00adadad00adULL,
+ 0x7777007777770077ULL, 0x8080008080800080ULL, 0x8282008282820082ULL,
+ 0xecec00ececec00ecULL, 0x2727002727270027ULL, 0xe5e500e5e5e500e5ULL,
+ 0x8585008585850085ULL, 0x3535003535350035ULL, 0x0c0c000c0c0c000cULL,
+ 0x4141004141410041ULL, 0xefef00efefef00efULL, 0x9393009393930093ULL,
+ 0x1919001919190019ULL, 0x2121002121210021ULL, 0x0e0e000e0e0e000eULL,
+ 0x4e4e004e4e4e004eULL, 0x6565006565650065ULL, 0xbdbd00bdbdbd00bdULL,
+ 0xb8b800b8b8b800b8ULL, 0x8f8f008f8f8f008fULL, 0xebeb00ebebeb00ebULL,
+ 0xcece00cecece00ceULL, 0x3030003030300030ULL, 0x5f5f005f5f5f005fULL,
+ 0xc5c500c5c5c500c5ULL, 0x1a1a001a1a1a001aULL, 0xe1e100e1e1e100e1ULL,
+ 0xcaca00cacaca00caULL, 0x4747004747470047ULL, 0x3d3d003d3d3d003dULL,
+ 0x0101000101010001ULL, 0xd6d600d6d6d600d6ULL, 0x5656005656560056ULL,
+ 0x4d4d004d4d4d004dULL, 0x0d0d000d0d0d000dULL, 0x6666006666660066ULL,
+ 0xcccc00cccccc00ccULL, 0x2d2d002d2d2d002dULL, 0x1212001212120012ULL,
+ 0x2020002020200020ULL, 0xb1b100b1b1b100b1ULL, 0x9999009999990099ULL,
+ 0x4c4c004c4c4c004cULL, 0xc2c200c2c2c200c2ULL, 0x7e7e007e7e7e007eULL,
+ 0x0505000505050005ULL, 0xb7b700b7b7b700b7ULL, 0x3131003131310031ULL,
+ 0x1717001717170017ULL, 0xd7d700d7d7d700d7ULL, 0x5858005858580058ULL,
+ 0x6161006161610061ULL, 0x1b1b001b1b1b001bULL, 0x1c1c001c1c1c001cULL,
+ 0x0f0f000f0f0f000fULL, 0x1616001616160016ULL, 0x1818001818180018ULL,
+ 0x2222002222220022ULL, 0x4444004444440044ULL, 0xb2b200b2b2b200b2ULL,
+ 0xb5b500b5b5b500b5ULL, 0x9191009191910091ULL, 0x0808000808080008ULL,
+ 0xa8a800a8a8a800a8ULL, 0xfcfc00fcfcfc00fcULL, 0x5050005050500050ULL,
+ 0xd0d000d0d0d000d0ULL, 0x7d7d007d7d7d007dULL, 0x8989008989890089ULL,
+ 0x9797009797970097ULL, 0x5b5b005b5b5b005bULL, 0x9595009595950095ULL,
+ 0xffff00ffffff00ffULL, 0xd2d200d2d2d200d2ULL, 0xc4c400c4c4c400c4ULL,
+ 0x4848004848480048ULL, 0xf7f700f7f7f700f7ULL, 0xdbdb00dbdbdb00dbULL,
+ 0x0303000303030003ULL, 0xdada00dadada00daULL, 0x3f3f003f3f3f003fULL,
+ 0x9494009494940094ULL, 0x5c5c005c5c5c005cULL, 0x0202000202020002ULL,
+ 0x4a4a004a4a4a004aULL, 0x3333003333330033ULL, 0x6767006767670067ULL,
+ 0xf3f300f3f3f300f3ULL, 0x7f7f007f7f7f007fULL, 0xe2e200e2e2e200e2ULL,
+ 0x9b9b009b9b9b009bULL, 0x2626002626260026ULL, 0x3737003737370037ULL,
+ 0x3b3b003b3b3b003bULL, 0x9696009696960096ULL, 0x4b4b004b4b4b004bULL,
+ 0xbebe00bebebe00beULL, 0x2e2e002e2e2e002eULL, 0x7979007979790079ULL,
+ 0x8c8c008c8c8c008cULL, 0x6e6e006e6e6e006eULL, 0x8e8e008e8e8e008eULL,
+ 0xf5f500f5f5f500f5ULL, 0xb6b600b6b6b600b6ULL, 0xfdfd00fdfdfd00fdULL,
+ 0x5959005959590059ULL, 0x9898009898980098ULL, 0x6a6a006a6a6a006aULL,
+ 0x4646004646460046ULL, 0xbaba00bababa00baULL, 0x2525002525250025ULL,
+ 0x4242004242420042ULL, 0xa2a200a2a2a200a2ULL, 0xfafa00fafafa00faULL,
+ 0x0707000707070007ULL, 0x5555005555550055ULL, 0xeeee00eeeeee00eeULL,
+ 0x0a0a000a0a0a000aULL, 0x4949004949490049ULL, 0x6868006868680068ULL,
+ 0x3838003838380038ULL, 0xa4a400a4a4a400a4ULL, 0x2828002828280028ULL,
+ 0x7b7b007b7b7b007bULL, 0xc9c900c9c9c900c9ULL, 0xc1c100c1c1c100c1ULL,
+ 0xe3e300e3e3e300e3ULL, 0xf4f400f4f4f400f4ULL, 0xc7c700c7c7c700c7ULL,
+ 0x9e9e009e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp11101110[256] = {
+ 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
+ 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
+ 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
+ 0x8585850085858500ULL, 0x5757570057575700ULL, 0x3535350035353500ULL,
+ 0xeaeaea00eaeaea00ULL, 0x0c0c0c000c0c0c00ULL, 0xaeaeae00aeaeae00ULL,
+ 0x4141410041414100ULL, 0x2323230023232300ULL, 0xefefef00efefef00ULL,
+ 0x6b6b6b006b6b6b00ULL, 0x9393930093939300ULL, 0x4545450045454500ULL,
+ 0x1919190019191900ULL, 0xa5a5a500a5a5a500ULL, 0x2121210021212100ULL,
+ 0xededed00ededed00ULL, 0x0e0e0e000e0e0e00ULL, 0x4f4f4f004f4f4f00ULL,
+ 0x4e4e4e004e4e4e00ULL, 0x1d1d1d001d1d1d00ULL, 0x6565650065656500ULL,
+ 0x9292920092929200ULL, 0xbdbdbd00bdbdbd00ULL, 0x8686860086868600ULL,
+ 0xb8b8b800b8b8b800ULL, 0xafafaf00afafaf00ULL, 0x8f8f8f008f8f8f00ULL,
+ 0x7c7c7c007c7c7c00ULL, 0xebebeb00ebebeb00ULL, 0x1f1f1f001f1f1f00ULL,
+ 0xcecece00cecece00ULL, 0x3e3e3e003e3e3e00ULL, 0x3030300030303000ULL,
+ 0xdcdcdc00dcdcdc00ULL, 0x5f5f5f005f5f5f00ULL, 0x5e5e5e005e5e5e00ULL,
+ 0xc5c5c500c5c5c500ULL, 0x0b0b0b000b0b0b00ULL, 0x1a1a1a001a1a1a00ULL,
+ 0xa6a6a600a6a6a600ULL, 0xe1e1e100e1e1e100ULL, 0x3939390039393900ULL,
+ 0xcacaca00cacaca00ULL, 0xd5d5d500d5d5d500ULL, 0x4747470047474700ULL,
+ 0x5d5d5d005d5d5d00ULL, 0x3d3d3d003d3d3d00ULL, 0xd9d9d900d9d9d900ULL,
+ 0x0101010001010100ULL, 0x5a5a5a005a5a5a00ULL, 0xd6d6d600d6d6d600ULL,
+ 0x5151510051515100ULL, 0x5656560056565600ULL, 0x6c6c6c006c6c6c00ULL,
+ 0x4d4d4d004d4d4d00ULL, 0x8b8b8b008b8b8b00ULL, 0x0d0d0d000d0d0d00ULL,
+ 0x9a9a9a009a9a9a00ULL, 0x6666660066666600ULL, 0xfbfbfb00fbfbfb00ULL,
+ 0xcccccc00cccccc00ULL, 0xb0b0b000b0b0b000ULL, 0x2d2d2d002d2d2d00ULL,
+ 0x7474740074747400ULL, 0x1212120012121200ULL, 0x2b2b2b002b2b2b00ULL,
+ 0x2020200020202000ULL, 0xf0f0f000f0f0f000ULL, 0xb1b1b100b1b1b100ULL,
+ 0x8484840084848400ULL, 0x9999990099999900ULL, 0xdfdfdf00dfdfdf00ULL,
+ 0x4c4c4c004c4c4c00ULL, 0xcbcbcb00cbcbcb00ULL, 0xc2c2c200c2c2c200ULL,
+ 0x3434340034343400ULL, 0x7e7e7e007e7e7e00ULL, 0x7676760076767600ULL,
+ 0x0505050005050500ULL, 0x6d6d6d006d6d6d00ULL, 0xb7b7b700b7b7b700ULL,
+ 0xa9a9a900a9a9a900ULL, 0x3131310031313100ULL, 0xd1d1d100d1d1d100ULL,
+ 0x1717170017171700ULL, 0x0404040004040400ULL, 0xd7d7d700d7d7d700ULL,
+ 0x1414140014141400ULL, 0x5858580058585800ULL, 0x3a3a3a003a3a3a00ULL,
+ 0x6161610061616100ULL, 0xdedede00dedede00ULL, 0x1b1b1b001b1b1b00ULL,
+ 0x1111110011111100ULL, 0x1c1c1c001c1c1c00ULL, 0x3232320032323200ULL,
+ 0x0f0f0f000f0f0f00ULL, 0x9c9c9c009c9c9c00ULL, 0x1616160016161600ULL,
+ 0x5353530053535300ULL, 0x1818180018181800ULL, 0xf2f2f200f2f2f200ULL,
+ 0x2222220022222200ULL, 0xfefefe00fefefe00ULL, 0x4444440044444400ULL,
+ 0xcfcfcf00cfcfcf00ULL, 0xb2b2b200b2b2b200ULL, 0xc3c3c300c3c3c300ULL,
+ 0xb5b5b500b5b5b500ULL, 0x7a7a7a007a7a7a00ULL, 0x9191910091919100ULL,
+ 0x2424240024242400ULL, 0x0808080008080800ULL, 0xe8e8e800e8e8e800ULL,
+ 0xa8a8a800a8a8a800ULL, 0x6060600060606000ULL, 0xfcfcfc00fcfcfc00ULL,
+ 0x6969690069696900ULL, 0x5050500050505000ULL, 0xaaaaaa00aaaaaa00ULL,
+ 0xd0d0d000d0d0d000ULL, 0xa0a0a000a0a0a000ULL, 0x7d7d7d007d7d7d00ULL,
+ 0xa1a1a100a1a1a100ULL, 0x8989890089898900ULL, 0x6262620062626200ULL,
+ 0x9797970097979700ULL, 0x5454540054545400ULL, 0x5b5b5b005b5b5b00ULL,
+ 0x1e1e1e001e1e1e00ULL, 0x9595950095959500ULL, 0xe0e0e000e0e0e000ULL,
+ 0xffffff00ffffff00ULL, 0x6464640064646400ULL, 0xd2d2d200d2d2d200ULL,
+ 0x1010100010101000ULL, 0xc4c4c400c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4848480048484800ULL, 0xa3a3a300a3a3a300ULL, 0xf7f7f700f7f7f700ULL,
+ 0x7575750075757500ULL, 0xdbdbdb00dbdbdb00ULL, 0x8a8a8a008a8a8a00ULL,
+ 0x0303030003030300ULL, 0xe6e6e600e6e6e600ULL, 0xdadada00dadada00ULL,
+ 0x0909090009090900ULL, 0x3f3f3f003f3f3f00ULL, 0xdddddd00dddddd00ULL,
+ 0x9494940094949400ULL, 0x8787870087878700ULL, 0x5c5c5c005c5c5c00ULL,
+ 0x8383830083838300ULL, 0x0202020002020200ULL, 0xcdcdcd00cdcdcd00ULL,
+ 0x4a4a4a004a4a4a00ULL, 0x9090900090909000ULL, 0x3333330033333300ULL,
+ 0x7373730073737300ULL, 0x6767670067676700ULL, 0xf6f6f600f6f6f600ULL,
+ 0xf3f3f300f3f3f300ULL, 0x9d9d9d009d9d9d00ULL, 0x7f7f7f007f7f7f00ULL,
+ 0xbfbfbf00bfbfbf00ULL, 0xe2e2e200e2e2e200ULL, 0x5252520052525200ULL,
+ 0x9b9b9b009b9b9b00ULL, 0xd8d8d800d8d8d800ULL, 0x2626260026262600ULL,
+ 0xc8c8c800c8c8c800ULL, 0x3737370037373700ULL, 0xc6c6c600c6c6c600ULL,
+ 0x3b3b3b003b3b3b00ULL, 0x8181810081818100ULL, 0x9696960096969600ULL,
+ 0x6f6f6f006f6f6f00ULL, 0x4b4b4b004b4b4b00ULL, 0x1313130013131300ULL,
+ 0xbebebe00bebebe00ULL, 0x6363630063636300ULL, 0x2e2e2e002e2e2e00ULL,
+ 0xe9e9e900e9e9e900ULL, 0x7979790079797900ULL, 0xa7a7a700a7a7a700ULL,
+ 0x8c8c8c008c8c8c00ULL, 0x9f9f9f009f9f9f00ULL, 0x6e6e6e006e6e6e00ULL,
+ 0xbcbcbc00bcbcbc00ULL, 0x8e8e8e008e8e8e00ULL, 0x2929290029292900ULL,
+ 0xf5f5f500f5f5f500ULL, 0xf9f9f900f9f9f900ULL, 0xb6b6b600b6b6b600ULL,
+ 0x2f2f2f002f2f2f00ULL, 0xfdfdfd00fdfdfd00ULL, 0xb4b4b400b4b4b400ULL,
+ 0x5959590059595900ULL, 0x7878780078787800ULL, 0x9898980098989800ULL,
+ 0x0606060006060600ULL, 0x6a6a6a006a6a6a00ULL, 0xe7e7e700e7e7e700ULL,
+ 0x4646460046464600ULL, 0x7171710071717100ULL, 0xbababa00bababa00ULL,
+ 0xd4d4d400d4d4d400ULL, 0x2525250025252500ULL, 0xababab00ababab00ULL,
+ 0x4242420042424200ULL, 0x8888880088888800ULL, 0xa2a2a200a2a2a200ULL,
+ 0x8d8d8d008d8d8d00ULL, 0xfafafa00fafafa00ULL, 0x7272720072727200ULL,
+ 0x0707070007070700ULL, 0xb9b9b900b9b9b900ULL, 0x5555550055555500ULL,
+ 0xf8f8f800f8f8f800ULL, 0xeeeeee00eeeeee00ULL, 0xacacac00acacac00ULL,
+ 0x0a0a0a000a0a0a00ULL, 0x3636360036363600ULL, 0x4949490049494900ULL,
+ 0x2a2a2a002a2a2a00ULL, 0x6868680068686800ULL, 0x3c3c3c003c3c3c00ULL,
+ 0x3838380038383800ULL, 0xf1f1f100f1f1f100ULL, 0xa4a4a400a4a4a400ULL,
+ 0x4040400040404000ULL, 0x2828280028282800ULL, 0xd3d3d300d3d3d300ULL,
+ 0x7b7b7b007b7b7b00ULL, 0xbbbbbb00bbbbbb00ULL, 0xc9c9c900c9c9c900ULL,
+ 0x4343430043434300ULL, 0xc1c1c100c1c1c100ULL, 0x1515150015151500ULL,
+ 0xe3e3e300e3e3e300ULL, 0xadadad00adadad00ULL, 0xf4f4f400f4f4f400ULL,
+ 0x7777770077777700ULL, 0xc7c7c700c7c7c700ULL, 0x8080800080808000ULL,
+ 0x9e9e9e009e9e9e00ULL,
+};
+
+/* key constants */
+#define CAMELLIA_SIGMA1L (0xA09E667FL)
+#define CAMELLIA_SIGMA1R (0x3BCC908BL)
+#define CAMELLIA_SIGMA2L (0xB67AE858L)
+#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
+#define CAMELLIA_SIGMA3L (0xC6EF372FL)
+#define CAMELLIA_SIGMA3R (0xE94F82BEL)
+#define CAMELLIA_SIGMA4L (0x54FF53A5L)
+#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
+#define CAMELLIA_SIGMA5L (0x10E527FAL)
+#define CAMELLIA_SIGMA5R (0xDE682D1DL)
+#define CAMELLIA_SIGMA6L (0xB05688C2L)
+#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
+
+/* macros */
+#define ROLDQ(l, r, bits) ({ \
+ u64 t = l; \
+ l = (l << bits) | (r >> (64 - bits)); \
+ r = (r << bits) | (t >> (64 - bits)); \
+})
+
+#define CAMELLIA_F(x, kl, kr, y) ({ \
+ u64 ii = x ^ (((u64)kl << 32) | kr); \
+ y = camellia_sp11101110[(uint8_t)ii]; \
+ y ^= camellia_sp44044404[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp30333033[(uint8_t)ii]; \
+ y ^= camellia_sp02220222[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp00444404[(uint8_t)ii]; \
+ y ^= camellia_sp03303033[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp22000222[(uint8_t)ii]; \
+ y ^= camellia_sp10011110[(uint8_t)(ii >> 8)]; \
+ y = ror64(y, 32); \
+})
+
+#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32))
+
+static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
+{
+ u64 kw4, tt;
+ u32 dw, tl, tr;
+
+ /* absorb kw2 to other subkeys */
+ /* round 2 */
+ subRL[3] ^= subRL[1];
+ /* round 4 */
+ subRL[5] ^= subRL[1];
+ /* round 6 */
+ subRL[7] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
+ /* modified for FLinv(kl2) */
+ dw = (subRL[1] & subRL[9]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 8 */
+ subRL[11] ^= subRL[1];
+ /* round 10 */
+ subRL[13] ^= subRL[1];
+ /* round 12 */
+ subRL[15] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
+ /* modified for FLinv(kl4) */
+ dw = (subRL[1] & subRL[17]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 14 */
+ subRL[19] ^= subRL[1];
+ /* round 16 */
+ subRL[21] ^= subRL[1];
+ /* round 18 */
+ subRL[23] ^= subRL[1];
+
+ if (max == 24) {
+ /* kw3 */
+ subRL[24] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[25];
+ } else {
+ subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
+ /* modified for FLinv(kl6) */
+ dw = (subRL[1] & subRL[25]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 20 */
+ subRL[27] ^= subRL[1];
+ /* round 22 */
+ subRL[29] ^= subRL[1];
+ /* round 24 */
+ subRL[31] ^= subRL[1];
+ /* kw3 */
+ subRL[32] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[33];
+ /* round 23 */
+ subRL[30] ^= kw4;
+ /* round 21 */
+ subRL[28] ^= kw4;
+ /* round 19 */
+ subRL[26] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[24]) << 32;
+ /* modified for FL(kl5) */
+ dw = (kw4 & subRL[24]) >> 32;
+ kw4 ^= rol32(dw, 1);
+ }
+
+ /* round 17 */
+ subRL[22] ^= kw4;
+ /* round 15 */
+ subRL[20] ^= kw4;
+ /* round 13 */
+ subRL[18] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[16]) << 32;
+ /* modified for FL(kl3) */
+ dw = (kw4 & subRL[16]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 11 */
+ subRL[14] ^= kw4;
+ /* round 9 */
+ subRL[12] ^= kw4;
+ /* round 7 */
+ subRL[10] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[8]) << 32;
+ /* modified for FL(kl1) */
+ dw = (kw4 & subRL[8]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 5 */
+ subRL[6] ^= kw4;
+ /* round 3 */
+ subRL[4] ^= kw4;
+ /* round 1 */
+ subRL[2] ^= kw4;
+ /* kw1 */
+ subRL[0] ^= kw4;
+
+ /* key XOR is end of F-function */
+ SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]); /* kw1 */
+ SET_SUBKEY_LR(2, subRL[3]); /* round 1 */
+ SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]); /* round 2 */
+ SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]); /* round 3 */
+ SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]); /* round 4 */
+ SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */
+
+ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
+ dw = tl & (subRL[8] >> 32); /* FL(kl1) */
+ tr = subRL[10] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */
+ SET_SUBKEY_LR(8, subRL[8]); /* FL(kl1) */
+ SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */
+
+ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
+ dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */
+ tr = subRL[7] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */
+ SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]); /* round 8 */
+ SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]); /* round 9 */
+ SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]); /* round 10 */
+ SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */
+
+ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
+ dw = tl & (subRL[16] >> 32); /* FL(kl3) */
+ tr = subRL[18] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */
+ SET_SUBKEY_LR(16, subRL[16]); /* FL(kl3) */
+ SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */
+
+ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
+ dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */
+ tr = subRL[15] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */
+ SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]); /* round 14 */
+ SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]); /* round 15 */
+ SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]); /* round 16 */
+ SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]); /* round 17 */
+
+ if (max == 24) {
+ SET_SUBKEY_LR(23, subRL[22]); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */
+ } else {
+ tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
+ dw = tl & (subRL[24] >> 32); /* FL(kl5) */
+ tr = subRL[26] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24]); /* FL(kl5) */
+ SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */
+
+ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
+ dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */
+ tr = subRL[23] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */
+ SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]); /* round 20 */
+ SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]); /* round 21 */
+ SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]); /* round 22 */
+ SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]); /* round 23 */
+ SET_SUBKEY_LR(31, subRL[30]); /* round 24 */
+ SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]); /* kw3 */
+ }
+}
+
+static void camellia_setup128(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr, ww;
+ u64 subRL[26];
+
+ /**
+ * k == kl || kr (|| is concatenation)
+ */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+
+ /* rotation left shift 15bit */
+ ROLDQ(kl, kr, 15);
+
+ /* k3 */
+ subRL[4] = kl;
+ /* k4 */
+ subRL[5] = kr;
+
+ /* rotation left shift 15+30bit */
+ ROLDQ(kl, kr, 30);
+
+ /* k7 */
+ subRL[10] = kl;
+ /* k8 */
+ subRL[11] = kr;
+
+ /* rotation left shift 15+30+15bit */
+ ROLDQ(kl, kr, 15);
+
+ /* k10 */
+ subRL[13] = kr;
+ /* rotation left shift 15+30+15+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+
+ /* rotation left shift 15+30+15+17+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* k13 */
+ subRL[18] = kl;
+ /* k14 */
+ subRL[19] = kr;
+
+ /* rotation left shift 15+30+15+17+17+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+
+ /* generate KA */
+ kl = subRL[0];
+ kr = subRL[1];
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+
+ /* current status == (kll, klr, w0, w1) */
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KA dependent subkeys */
+ /* k1, k2 */
+ subRL[2] = kl;
+ subRL[3] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k5,k6 */
+ subRL[6] = kl;
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl1, kl2 */
+ subRL[8] = kl;
+ subRL[9] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k9 */
+ subRL[12] = kl;
+ ROLDQ(kl, kr, 15);
+ /* k11, k12 */
+ subRL[14] = kl;
+ subRL[15] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k15, k16 */
+ subRL[20] = kl;
+ subRL[21] = kr;
+ ROLDQ(kl, kr, 17);
+ /* kw3, kw4 */
+ subRL[24] = kl;
+ subRL[25] = kr;
+
+ camellia_setup_tail(subkey, subRL, 24);
+}
+
+static void camellia_setup256(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr; /* left half of key */
+ u64 krl, krr; /* right half of key */
+ u64 ww; /* temporary variables */
+ u64 subRL[34];
+
+ /**
+ * key = (kl || kr || krl || krr) (|| is concatenation)
+ */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+ krl = get_unaligned_be64(key + 16);
+ krr = get_unaligned_be64(key + 24);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+ ROLDQ(kl, kr, 45);
+ /* k9 */
+ subRL[12] = kl;
+ /* k10 */
+ subRL[13] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+ ROLDQ(kl, kr, 17);
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k23 */
+ subRL[30] = kl;
+ /* k24 */
+ subRL[31] = kr;
+
+ /* generate KR dependent subkeys */
+ ROLDQ(krl, krr, 15);
+ /* k3 */
+ subRL[4] = krl;
+ /* k4 */
+ subRL[5] = krr;
+ ROLDQ(krl, krr, 15);
+ /* kl1 */
+ subRL[8] = krl;
+ /* kl2 */
+ subRL[9] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k13 */
+ subRL[18] = krl;
+ /* k14 */
+ subRL[19] = krr;
+ ROLDQ(krl, krr, 34);
+ /* k19 */
+ subRL[26] = krl;
+ /* k20 */
+ subRL[27] = krr;
+ ROLDQ(krl, krr, 34);
+
+ /* generate KA */
+ kl = subRL[0] ^ krl;
+ kr = subRL[1] ^ krr;
+
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+ kl ^= krl;
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww ^ krr;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KB */
+ krl ^= kl;
+ krr ^= kr;
+ CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww);
+ krr ^= ww;
+ CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww);
+ krl ^= ww;
+
+ /* generate KA dependent subkeys */
+ ROLDQ(kl, kr, 15);
+ /* k5 */
+ subRL[6] = kl;
+ /* k6 */
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 30);
+ /* k11 */
+ subRL[14] = kl;
+ /* k12 */
+ subRL[15] = kr;
+ /* rotation left shift 32bit */
+ ROLDQ(kl, kr, 32);
+ /* kl5 */
+ subRL[24] = kl;
+ /* kl6 */
+ subRL[25] = kr;
+ /* rotation left shift 17 from k11,k12 -> k21,k22 */
+ ROLDQ(kl, kr, 17);
+ /* k21 */
+ subRL[28] = kl;
+ /* k22 */
+ subRL[29] = kr;
+
+ /* generate KB dependent subkeys */
+ /* k1 */
+ subRL[2] = krl;
+ /* k2 */
+ subRL[3] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k7 */
+ subRL[10] = krl;
+ /* k8 */
+ subRL[11] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k15 */
+ subRL[20] = krl;
+ /* k16 */
+ subRL[21] = krr;
+ ROLDQ(krl, krr, 51);
+ /* kw3 */
+ subRL[32] = krl;
+ /* kw4 */
+ subRL[33] = krr;
+
+ camellia_setup_tail(subkey, subRL, 32);
+}
+
+static void camellia_setup192(const unsigned char *key, u64 *subkey)
+{
+ unsigned char kk[32];
+ u64 krl, krr;
+
+ memcpy(kk, key, 24);
+ memcpy((unsigned char *)&krl, key+16, 8);
+ krr = ~krl;
+ memcpy(kk+24, (unsigned char *)&krr, 8);
+ camellia_setup256(kk, subkey);
+}
+
+int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
+ unsigned int key_len, u32 *flags)
+{
+ if (key_len != 16 && key_len != 24 && key_len != 32) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ cctx->key_length = key_len;
+
+ switch (key_len) {
+ case 16:
+ camellia_setup128(key, cctx->key_table);
+ break;
+ case 24:
+ camellia_setup192(key, cctx->key_table);
+ break;
+ case 32:
+ camellia_setup256(key, cctx->key_table);
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__camellia_setkey);
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len,
+ &tfm->crt_flags);
+}
+
+static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return camellia_setkey(&tfm->base, key, key_len);
+}
+
+void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s)
+{
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+ u128 iv = *src;
+
+ camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
+
+ u128_xor(&dst[1], &dst[1], &iv);
+}
+EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
+
+void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ if (dst != src)
+ *dst = *src;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
+
+void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblks[2];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ }
+
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);
+
+ camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_enc_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_enc_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = camellia_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = camellia_dec_blk_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = camellia_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = camellia_dec_blk }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+static struct crypto_alg camellia_cipher_alg = {
+ .cra_name = "camellia",
+ .cra_driver_name = "camellia-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct camellia_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .cia_setkey = camellia_setkey,
+ .cia_encrypt = camellia_encrypt,
+ .cia_decrypt = camellia_decrypt
+ }
+ }
+};
+
+static struct skcipher_alg camellia_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(camellia)",
+ .base.cra_driver_name = "ctr-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, camellia-asm is slower than original assembler
+ * implementation because excessive uses of 64bit rotate and
+ * left-shifts (which are really slow on P4) needed to store and
+ * handle 128bit block in two 64bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "camellia-x86_64: performance on this CPU "
+ "would be suboptimal: disabling "
+ "camellia-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&camellia_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&camellia_cipher_alg);
+
+ return err;
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&camellia_cipher_alg);
+ crypto_unregister_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/marvell/linux/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 0000000..dc55c33
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,563 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.file "cast5-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (16*4)
+#define rr ((16*4)+16)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 16-way AVX cast5
+ **********************************************************************/
+#define CTX %r15
+
+#define RL1 %xmm0
+#define RR1 %xmm1
+#define RL2 %xmm2
+#define RR2 %xmm3
+#define RL3 %xmm4
+#define RR3 %xmm5
+#define RL4 %xmm6
+#define RR4 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
+ movl s1(, RID1, 4), dst ## d; \
+ op1 s2(, RID2, 4), dst ## d; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ op2 s3(, RID1, 4), dst ## d; \
+ op3 s4(, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
+
+#define subround(a1, b1, a2, b2, f) \
+ F ## f ## _2(a1, b1, a2, b2);
+
+#define round(l, r, n, f) \
+ vbroadcastss (km+(4*n))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR; \
+ subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
+ subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
+
+#define enc_preload_rkr() \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR;
+
+#define dec_preload_rkr() \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR; \
+ vpshufb .Lbswap128_mask, RKR, RKR;
+
+#define transpose_2x4(x0, x1, t0, t1) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t1; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1;
+
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ \
+ transpose_2x4(x0, x1, t0, t1)
+
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
+ transpose_2x4(x0, x1, t0, t1) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1;
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_iv_mask:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+.section .rodata.cst4.32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+.align 16
+__cast5_enc_blk16:
+ /* input:
+ * %rdi: ctx
+ * RL1: blocks 1 and 2
+ * RR1: blocks 3 and 4
+ * RL2: blocks 5 and 6
+ * RR2: blocks 7 and 8
+ * RL3: blocks 9 and 10
+ * RR3: blocks 11 and 12
+ * RL4: blocks 13 and 14
+ * RR4: blocks 15 and 16
+ * output:
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+ enc_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ round(RL, RR, 0, 1);
+ round(RR, RL, 1, 2);
+ round(RL, RR, 2, 3);
+ round(RR, RL, 3, 1);
+ round(RL, RR, 4, 2);
+ round(RR, RL, 5, 3);
+ round(RL, RR, 6, 1);
+ round(RR, RL, 7, 2);
+ round(RL, RR, 8, 3);
+ round(RR, RL, 9, 1);
+ round(RL, RR, 10, 2);
+ round(RR, RL, 11, 3);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_enc;
+
+ round(RL, RR, 12, 1);
+ round(RR, RL, 13, 2);
+ round(RL, RR, 14, 3);
+ round(RR, RL, 15, 1);
+
+.L__skip_enc:
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ ret;
+ENDPROC(__cast5_enc_blk16)
+
+.align 16
+__cast5_dec_blk16:
+ /* input:
+ * %rdi: ctx
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ * output:
+ * RL1: decrypted blocks 1 and 2
+ * RR1: decrypted blocks 3 and 4
+ * RL2: decrypted blocks 5 and 6
+ * RR2: decrypted blocks 7 and 8
+ * RL3: decrypted blocks 9 and 10
+ * RR3: decrypted blocks 11 and 12
+ * RL4: decrypted blocks 13 and 14
+ * RR4: decrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+ dec_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_dec;
+
+ round(RL, RR, 15, 1);
+ round(RR, RL, 14, 3);
+ round(RL, RR, 13, 2);
+ round(RR, RL, 12, 1);
+
+.L__dec_tail:
+ round(RL, RR, 11, 3);
+ round(RR, RL, 10, 2);
+ round(RL, RR, 9, 1);
+ round(RR, RL, 8, 3);
+ round(RL, RR, 7, 2);
+ round(RR, RL, 6, 1);
+ round(RL, RR, 5, 3);
+ round(RR, RL, 4, 2);
+ round(RL, RR, 3, 1);
+ round(RR, RL, 2, 3);
+ round(RL, RR, 1, 2);
+ round(RR, RL, 0, 1);
+
+ vmovdqa .Lbswap_mask, RKM;
+ popq %rbx;
+ popq %r15;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ ret;
+
+.L__skip_dec:
+ vpsrldq $4, RKR, RKR;
+ jmp .L__dec_tail;
+ENDPROC(__cast5_dec_blk16)
+
+ENTRY(cast5_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_enc_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ecb_enc_16way)
+
+ENTRY(cast5_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ecb_dec_16way)
+
+ENTRY(cast5_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vmovdqu (0*16)(%rdx), RL1;
+ vmovdqu (1*16)(%rdx), RR1;
+ vmovdqu (2*16)(%rdx), RL2;
+ vmovdqu (3*16)(%rdx), RR2;
+ vmovdqu (4*16)(%rdx), RL3;
+ vmovdqu (5*16)(%rdx), RR3;
+ vmovdqu (6*16)(%rdx), RL4;
+ vmovdqu (7*16)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ /* xor with src */
+ vmovq (%r12), RX;
+ vpshufd $0x4f, RX, RX;
+ vpxor RX, RR1, RR1;
+ vpxor 0*16+8(%r12), RL1, RL1;
+ vpxor 1*16+8(%r12), RR2, RR2;
+ vpxor 2*16+8(%r12), RL2, RL2;
+ vpxor 3*16+8(%r12), RR3, RR3;
+ vpxor 4*16+8(%r12), RL3, RL3;
+ vpxor 5*16+8(%r12), RR4, RR4;
+ vpxor 6*16+8(%r12), RL4, RL4;
+
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast5_cbc_dec_16way)
+
+ENTRY(cast5_ctr_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (big endian, 64bit)
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vpcmpeqd RTMP, RTMP, RTMP;
+ vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+ vpcmpeqd RKR, RKR, RKR;
+ vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+ vmovdqa .Lbswap_iv_mask, R1ST;
+ vmovdqa .Lbswap128_mask, RKM;
+
+ /* load IV and byteswap */
+ vmovq (%rcx), RX;
+ vpshufb R1ST, RX, RX;
+
+ /* construct IVs */
+ vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+ vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+ /* store last IV */
+ vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+ vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+ vmovq RX, (%rcx);
+
+ call __cast5_enc_blk16;
+
+ /* dst = src ^ iv */
+ vpxor (0*16)(%r12), RR1, RR1;
+ vpxor (1*16)(%r12), RL1, RL1;
+ vpxor (2*16)(%r12), RR2, RR2;
+ vpxor (3*16)(%r12), RL2, RL2;
+ vpxor (4*16)(%r12), RR3, RR3;
+ vpxor (5*16)(%r12), RL3, RL3;
+ vpxor (6*16)(%r12), RR4, RR4;
+ vpxor (7*16)(%r12), RL4, RL4;
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ctr_16way)
diff --git a/marvell/linux/arch/x86/crypto/cast5_avx_glue.c b/marvell/linux/arch/x86/crypto/cast5_avx_glue.c
new file mode 100644
index 0000000..384ccb0
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/cast5_avx_glue.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for the AVX assembler implementation of the Cast5 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ */
+
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/cast5.h>
+#include <crypto/internal/simd.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAST5_PARALLEL_BLOCKS 16
+
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+ __be64 *iv);
+
+static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return cast5_setkey(&tfm->base, key, keylen);
+}
+
+static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk,
+ unsigned int nbytes)
+{
+ return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
+ walk, fpu_enabled, nbytes);
+}
+
+static inline void cast5_fpu_end(bool fpu_enabled)
+{
+ return glue_fpu_end(fpu_enabled);
+}
+
+static int ecb_crypt(struct skcipher_request *req, bool enc)
+{
+ bool fpu_enabled = false;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes;
+ void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize * CAST5_PARALLEL_BLOCKS;
+ wdst += bsize * CAST5_PARALLEL_BLOCKS;
+ nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
+ /* Handle leftovers */
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, true);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, false);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u64 *src = (u64 *)walk.src.virt.addr;
+ u64 *dst = (u64 *)walk.dst.virt.addr;
+ u64 *iv = (u64 *)walk.iv;
+
+ do {
+ *dst = *src ^ *iv;
+ __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+ src++;
+ dst++;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk.iv = *iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct cast5_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ do {
+ nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
+ src -= CAST5_PARALLEL_BLOCKS - 1;
+ dst -= CAST5_PARALLEL_BLOCKS - 1;
+
+ cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ bool fpu_enabled = false;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+ return err;
+}
+
+static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[CAST5_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ __cast5_encrypt(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct skcipher_walk *walk,
+ struct cast5_ctx *ctx)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ do {
+ cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+ (__be64 *)walk->iv);
+
+ src += CAST5_PARALLEL_BLOCKS;
+ dst += CAST5_PARALLEL_BLOCKS;
+ nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ u64 ctrblk;
+
+ if (dst != src)
+ *dst = *src;
+
+ ctrblk = *(u64 *)walk->iv;
+ be64_add_cpu((__be64 *)walk->iv, 1);
+
+ __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ *dst ^= ctrblk;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ bool fpu_enabled = false;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ nbytes = __ctr_crypt(&walk, ctx);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+
+ if (walk.nbytes) {
+ ctr_crypt_final(&walk, ctx);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg cast5_algs[] = {
+ {
+ .base.cra_name = "__ecb(cast5)",
+ .base.cra_driver_name = "__ecb-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(cast5)",
+ .base.cra_driver_name = "__cbc-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .ivsize = CAST5_BLOCK_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(cast5)",
+ .base.cra_driver_name = "__ctr-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .ivsize = CAST5_BLOCK_SIZE,
+ .chunksize = CAST5_BLOCK_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)];
+
+static int __init cast5_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(cast5_algs,
+ ARRAY_SIZE(cast5_algs),
+ cast5_simd_algs);
+}
+
+static void __exit cast5_exit(void)
+{
+ simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs),
+ cast5_simd_algs);
+}
+
+module_init(cast5_init);
+module_exit(cast5_exit);
+
+MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast5");
diff --git a/marvell/linux/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 0000000..4f0a7cd
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,496 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "cast6-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (12*4*4)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 8-way AVX cast6
+ **********************************************************************/
+#define CTX %r15
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
+ movl s1(, RID1, 4), dst ## d; \
+ op1 s2(, RID2, 4), dst ## d; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ op2 s3(, RID1, 4), dst ## d; \
+ op3 s4(, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
+
+#define qop(in, out, f) \
+ F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
+
+#define get_round_keys(nn) \
+ vbroadcastss (km+(4*(nn)))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR;
+
+#define Q(n) \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1);
+
+#define QBAR(n) \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1);
+
+#define shuffle(mask) \
+ vpshufb mask, RKR, RKR;
+
+#define preload_rkr(n, do_mask, mask) \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor (kr+n*16)(CTX), RKR, RKR; \
+ do_mask(mask);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3;
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_enc_Q_Q_QBAR_QBAR:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_dec_Q_Q_Q_Q:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+.Lrkr_dec_Q_Q_QBAR_QBAR:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.L16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+
+.section .rodata.cst4.L32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+.align 8
+__cast6_enc_blk8:
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(0, dummy, none);
+ Q(0);
+ Q(1);
+ Q(2);
+ Q(3);
+ preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
+ Q(4);
+ Q(5);
+ QBAR(6);
+ QBAR(7);
+ preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
+ QBAR(8);
+ QBAR(9);
+ QBAR(10);
+ QBAR(11);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ ret;
+ENDPROC(__cast6_enc_blk8)
+
+.align 8
+__cast6_dec_blk8:
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
+ Q(11);
+ Q(10);
+ Q(9);
+ Q(8);
+ preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
+ Q(7);
+ Q(6);
+ QBAR(5);
+ QBAR(4);
+ preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
+ QBAR(3);
+ QBAR(2);
+ QBAR(1);
+ QBAR(0);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ ret;
+ENDPROC(__cast6_dec_blk8)
+
+ENTRY(cast6_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_enc_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ecb_enc_8way)
+
+ENTRY(cast6_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ecb_dec_8way)
+
+ENTRY(cast6_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast6_cbc_dec_8way)
+
+ENTRY(cast6_ctr_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX, RKR, RKM);
+
+ call __cast6_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ctr_8way)
+
+ENTRY(cast6_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_xts_enc_8way)
+
+ENTRY(cast6_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_xts_dec_8way)
diff --git a/marvell/linux/arch/x86/crypto/cast6_avx_glue.c b/marvell/linux/arch/x86/crypto/cast6_avx_glue.c
new file mode 100644
index 0000000..da52974
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/cast6_avx_glue.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for the AVX assembler implementation of the Cast6 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/cast6.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAST6_PARALLEL_BLOCKS 8
+
+asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return cast6_setkey(&tfm->base, key, keylen);
+}
+
+static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt);
+}
+
+static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt);
+}
+
+static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static const struct common_glue_ctx cast6_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = cast6_ecb_enc_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __cast6_encrypt }
+ } }
+};
+
+static const struct common_glue_ctx cast6_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = cast6_ctr_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = cast6_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx cast6_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = cast6_xts_enc_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = cast6_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = cast6_ecb_dec_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __cast6_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = cast6_cbc_dec_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = __cast6_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = cast6_xts_dec_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = cast6_xts_dec }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&cast6_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&cast6_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&cast6_ctr, req);
+}
+
+struct cast6_xts_ctx {
+ struct cast6_ctx tweak_ctx;
+ struct cast6_ctx crypt_ctx;
+};
+
+static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg cast6_algs[] = {
+ {
+ .base.cra_name = "__ecb(cast6)",
+ .base.cra_driver_name = "__ecb-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(cast6)",
+ .base.cra_driver_name = "__cbc-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(cast6)",
+ .base.cra_driver_name = "__ctr-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .chunksize = CAST6_BLOCK_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(cast6)",
+ .base.cra_driver_name = "__xts-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAST6_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .setkey = xts_cast6_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)];
+
+static int __init cast6_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(cast6_algs,
+ ARRAY_SIZE(cast6_algs),
+ cast6_simd_algs);
+}
+
+static void __exit cast6_exit(void)
+{
+ simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs),
+ cast6_simd_algs);
+}
+
+module_init(cast6_init);
+module_exit(cast6_exit);
+
+MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast6");
diff --git a/marvell/linux/arch/x86/crypto/chacha-avx2-x86_64.S b/marvell/linux/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 0000000..831e443
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1021 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. As we need some
+ # scratch registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+ # words, which allows us to do XOR in AVX registers. 8/16-bit word
+ # rotation is done with the slightly better performing byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+ mov %rcx,%rax
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)
+
+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ lea -8(%r10),%rsp
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/marvell/linux/arch/x86/crypto/chacha-avx512vl-x86_64.S b/marvell/linux/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 0000000..596f91e
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone2
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone4
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. Compared to AVX2, this
+ # mostly benefits from the new rotate instructions in VL and the
+ # additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/marvell/linux/arch/x86/crypto/chacha-ssse3-x86_64.S b/marvell/linux/arch/x86/crypto/chacha-ssse3-x86_64.S
new file mode 100644
index 0000000..2d86c7d
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+.section .rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC: .octa 0x00000003000000020000000100000000
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+chacha_permute:
+
+ movdqa ROT8(%rip),%xmm4
+ movdqa ROT16(%rip),%xmm5
+
+.Ldoubleround:
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm3,%xmm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm3,%xmm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ mov %rcx,%rax
+ call chacha_permute
+
+ # o0 = i0 ^ (x0 + s0)
+ paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
+ pxor %xmm4,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+ # o1 = i1 ^ (x1 + s1)
+ paddd %xmm9,%xmm1
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+ # o2 = i2 ^ (x2 + s2)
+ paddd %xmm10,%xmm2
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+ # o3 = i3 ^ (x3 + s3)
+ paddd %xmm11,%xmm3
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+.Ldone:
+ FRAME_END
+ ret
+
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: output (8 32-bit words)
+ # %edx: nrounds
+ FRAME_BEGIN
+
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+
+ movdqu %xmm0,0x00(%rsi)
+ movdqu %xmm3,0x10(%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(hchacha_block_ssse3)
+
+ENTRY(chacha_4block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four consecutive ChaCha blocks by loading the
+ # the state matrix in SSE registers four times. As we need some scratch
+ # registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32- and then 64-bit words,
+ # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+ # done with the slightly better performing SSSE3 byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ lea 8(%rsp),%r10
+ sub $0x80,%rsp
+ and $~63,%rsp
+ mov %rcx,%rax
+
+ # x0..15[0-3] = s0..3[0..3]
+ movq 0x00(%rdi),%xmm1
+ pshufd $0x00,%xmm1,%xmm0
+ pshufd $0x55,%xmm1,%xmm1
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ movq 0x10(%rdi),%xmm5
+ pshufd $0x00,%xmm5,%xmm4
+ pshufd $0x55,%xmm5,%xmm5
+ movq 0x18(%rdi),%xmm7
+ pshufd $0x00,%xmm7,%xmm6
+ pshufd $0x55,%xmm7,%xmm7
+ movq 0x20(%rdi),%xmm9
+ pshufd $0x00,%xmm9,%xmm8
+ pshufd $0x55,%xmm9,%xmm9
+ movq 0x28(%rdi),%xmm11
+ pshufd $0x00,%xmm11,%xmm10
+ pshufd $0x55,%xmm11,%xmm11
+ movq 0x30(%rdi),%xmm13
+ pshufd $0x00,%xmm13,%xmm12
+ pshufd $0x55,%xmm13,%xmm13
+ movq 0x38(%rdi),%xmm15
+ pshufd $0x00,%xmm15,%xmm14
+ pshufd $0x55,%xmm15,%xmm15
+ # x0..3 on stack
+ movdqa %xmm0,0x00(%rsp)
+ movdqa %xmm1,0x10(%rsp)
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm3,0x30(%rsp)
+
+ movdqa CTRINC(%rip),%xmm1
+ movdqa ROT8(%rip),%xmm2
+ movdqa ROT16(%rip),%xmm3
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+.Ldoubleround4:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # x0[0-3] += s0[0]
+ # x1[0-3] += s0[1]
+ movq 0x00(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x00(%rsp),%xmm2
+ movdqa %xmm2,0x00(%rsp)
+ paddd 0x10(%rsp),%xmm3
+ movdqa %xmm3,0x10(%rsp)
+ # x2[0-3] += s0[2]
+ # x3[0-3] += s0[3]
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x20(%rsp),%xmm2
+ movdqa %xmm2,0x20(%rsp)
+ paddd 0x30(%rsp),%xmm3
+ movdqa %xmm3,0x30(%rsp)
+
+ # x4[0-3] += s1[0]
+ # x5[0-3] += s1[1]
+ movq 0x10(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ # x6[0-3] += s1[2]
+ # x7[0-3] += s1[3]
+ movq 0x18(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+
+ # x8[0-3] += s2[0]
+ # x9[0-3] += s2[1]
+ movq 0x20(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm8
+ paddd %xmm3,%xmm9
+ # x10[0-3] += s2[2]
+ # x11[0-3] += s2[3]
+ movq 0x28(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm10
+ paddd %xmm3,%xmm11
+
+ # x12[0-3] += s3[0]
+ # x13[0-3] += s3[1]
+ movq 0x30(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm12
+ paddd %xmm3,%xmm13
+ # x14[0-3] += s3[2]
+ # x15[0-3] += s3[3]
+ movq 0x38(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm14
+ paddd %xmm3,%xmm15
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+ # interleave 32-bit words in state n, n+1
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x10(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x10(%rsp)
+ movdqa 0x20(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpckldq %xmm5,%xmm4
+ punpckhdq %xmm5,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ punpckldq %xmm7,%xmm6
+ punpckhdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm9,%xmm0
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,%xmm0
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpckldq %xmm13,%xmm12
+ punpckhdq %xmm13,%xmm0
+ movdqa %xmm0,%xmm13
+ movdqa %xmm14,%xmm0
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # interleave 64-bit words in state n, n+2
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x20(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x20(%rsp)
+ movdqa 0x10(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x10(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpcklqdq %xmm6,%xmm4
+ punpckhqdq %xmm6,%xmm0
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ punpcklqdq %xmm7,%xmm5
+ punpckhqdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpcklqdq %xmm10,%xmm8
+ punpckhqdq %xmm10,%xmm0
+ movdqa %xmm0,%xmm10
+ movdqa %xmm9,%xmm0
+ punpcklqdq %xmm11,%xmm9
+ punpckhqdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpcklqdq %xmm14,%xmm12
+ punpckhqdq %xmm14,%xmm0
+ movdqa %xmm0,%xmm14
+ movdqa %xmm13,%xmm0
+ punpcklqdq %xmm15,%xmm13
+ punpckhqdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # xor with corresponding input, write to output
+ movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ movdqu 0x00(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+
+ movdqu %xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+
+ movdqu %xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+ movdqu %xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+ movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ movdqu 0x40(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x40(%rsi)
+
+ movdqu %xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+ movdqu %xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+ movdqu %xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+ movdqu %xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+ movdqu %xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+ movdqu %xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
+ movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ movdqu 0xc0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xc0(%rsi)
+
+ movdqu %xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ movdqu 0xd0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+ movdqu %xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ movdqu 0xe0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+ movdqu %xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ movdqu 0xf0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)
+
+.Ldone4:
+ lea -8(%r10),%rsp
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/marvell/linux/arch/x86/crypto/chacha_glue.c b/marvell/linux/arch/x86/crypto/chacha_glue.c
new file mode 100644
index 0000000..388f95a
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+ return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+ if (chacha_use_avx512vl) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_2block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ if (chacha_use_avx2) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12]++;
+ }
+}
+
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ int next_yield = 4096; /* bytes until next FPU yield */
+ int err = 0;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk->nbytes > 0) {
+ unsigned int nbytes = walk->nbytes;
+
+ if (nbytes < walk->total) {
+ nbytes = round_down(nbytes, walk->stride);
+ next_yield -= nbytes;
+ }
+
+ chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes, ctx->nrounds);
+
+ if (next_yield <= 0) {
+ /* temporarily allow preemption */
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ next_yield = 4096;
+ }
+
+ err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_chacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+ kernel_fpu_end();
+ return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ struct chacha_ctx subctx;
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ u8 real_iv[16];
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_xchacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ crypto_chacha_init(state, ctx, req->iv);
+
+ kernel_fpu_begin();
+
+ hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+
+ kernel_fpu_end();
+
+ return err;
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha_simd,
+ .decrypt = chacha_simd,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ },
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+ chacha_use_avx512vl = chacha_use_avx2 &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/marvell/linux/arch/x86/crypto/crc32-pclmul_asm.S b/marvell/linux/arch/x86/crypto/crc32-pclmul_asm.S
new file mode 100644
index 0000000..1c099dc
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crc32-pclmul_asm.S
@@ -0,0 +1,241 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+
+.section .rodata
+.align 16
+/*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+ .octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+ .octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+.Lconstant_R5:
+ .octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+ .octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+ .octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define BUF %rdi
+#define LEN %rsi
+#define CRC %edx
+#else
+#define BUF %eax
+#define LEN %edx
+#define CRC %ecx
+#endif
+
+
+
+.text
+/**
+ * Calculate crc32
+ * BUF - buffer (16 bytes aligned)
+ * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ * CRC - initial crc32
+ * return %eax crc32
+ * uint crc32_pclmul_le_16(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+
+ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+ movdqa (BUF), %xmm1
+ movdqa 0x10(BUF), %xmm2
+ movdqa 0x20(BUF), %xmm3
+ movdqa 0x30(BUF), %xmm4
+ movd CRC, CONSTANT
+ pxor CONSTANT, %xmm1
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jb less_64
+
+#ifdef __x86_64__
+ movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R2R1, CONSTANT
+#endif
+
+loop_64:/* 64 bytes Full cache line folding */
+ prefetchnta 0x40(BUF)
+ movdqa %xmm1, %xmm5
+ movdqa %xmm2, %xmm6
+ movdqa %xmm3, %xmm7
+#ifdef __x86_64__
+ movdqa %xmm4, %xmm8
+#endif
+ PCLMULQDQ 00, CONSTANT, %xmm1
+ PCLMULQDQ 00, CONSTANT, %xmm2
+ PCLMULQDQ 00, CONSTANT, %xmm3
+#ifdef __x86_64__
+ PCLMULQDQ 00, CONSTANT, %xmm4
+#endif
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ PCLMULQDQ 0x11, CONSTANT, %xmm6
+ PCLMULQDQ 0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+ PCLMULQDQ 0x11, CONSTANT, %xmm8
+#endif
+ pxor %xmm5, %xmm1
+ pxor %xmm6, %xmm2
+ pxor %xmm7, %xmm3
+#ifdef __x86_64__
+ pxor %xmm8, %xmm4
+#else
+ /* xmm8 unsupported for x32 */
+ movdqa %xmm4, %xmm5
+ PCLMULQDQ 00, CONSTANT, %xmm4
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm4
+#endif
+
+ pxor (BUF), %xmm1
+ pxor 0x10(BUF), %xmm2
+ pxor 0x20(BUF), %xmm3
+ pxor 0x30(BUF), %xmm4
+
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jge loop_64
+less_64:/* Folding cache line into 128bit */
+#ifdef __x86_64__
+ movdqa .Lconstant_R4R3(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R4R3, CONSTANT
+#endif
+ prefetchnta (BUF)
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm2, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm3, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm4, %xmm1
+
+ cmp $0x10, LEN
+ jb fold_64
+loop_16:/* Folding rest buffer into 128bit */
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor (BUF), %xmm1
+ sub $0x10, LEN
+ add $0x10, BUF
+ cmp $0x10, LEN
+ jge loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ psrldq $0x08, %xmm1
+ pxor CONSTANT, %xmm1
+
+ /* final 32-bit fold */
+ movdqa %xmm1, %xmm2
+#ifdef __x86_64__
+ movdqa .Lconstant_R5(%rip), CONSTANT
+ movdqa .Lconstant_mask32(%rip), %xmm3
+#else
+ movdqa .Lconstant_R5, CONSTANT
+ movdqa .Lconstant_mask32, %xmm3
+#endif
+ psrldq $0x04, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+ movdqa .Lconstant_RUpoly(%rip), CONSTANT
+#else
+ movdqa .Lconstant_RUpoly, CONSTANT
+#endif
+ movdqa %xmm1, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x10, CONSTANT, %xmm1
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+ PEXTRD 0x01, %xmm1, %eax
+
+ ret
+ENDPROC(crc32_pclmul_le_16)
diff --git a/marvell/linux/arch/x86/crypto/crc32-pclmul_glue.c b/marvell/linux/arch/x86/crypto/crc32-pclmul_glue.c
new file mode 100644
index 0000000..cb4ab66
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crc32-pclmul_glue.c
@@ -0,0 +1,203 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define PCLMUL_MIN_LEN 64L /* minimum size of buffer
+ * for crc32_pclmul_le_16 */
+#define SCALE_F 16L /* size of xmm register */
+#define SCALE_F_MASK (SCALE_F - 1)
+
+u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+
+static u32 __attribute__((pure))
+ crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient;
+ unsigned int iremainder;
+ unsigned int prealign;
+
+ if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
+ return crc32_le(crc, p, len);
+
+ if ((long)p & SCALE_F_MASK) {
+ /* align p to 16 byte */
+ prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+
+ crc = crc32_le(crc, p, prealign);
+ len -= prealign;
+ p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
+ ~SCALE_F_MASK);
+ }
+ iquotient = len & (~SCALE_F_MASK);
+ iremainder = len & SCALE_F_MASK;
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, iquotient, crc);
+ kernel_fpu_end();
+
+ if (iremainder)
+ crc = crc32_le(crc, p + iquotient, iremainder);
+
+ return crc;
+}
+
+static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = 0;
+
+ return 0;
+}
+
+static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32_pclmul_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32_pclmul_le(*crcp, data, len);
+ return 0;
+}
+
+/* No final XOR 0xFFFFFFFF, like crc32_le */
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
+ return 0;
+}
+
+static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static struct shash_alg alg = {
+ .setkey = crc32_pclmul_setkey,
+ .init = crc32_pclmul_init,
+ .update = crc32_pclmul_update,
+ .final = crc32_pclmul_final,
+ .finup = crc32_pclmul_finup,
+ .digest = crc32_pclmul_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32",
+ .cra_driver_name = "crc32-pclmul",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_pclmul_cra_init,
+ }
+};
+
+static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);
+
+
+static int __init crc32_pclmul_mod_init(void)
+{
+
+ if (!x86_match_cpu(crc32pclmul_cpu_id)) {
+ pr_info("PCLMULQDQ-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32_pclmul_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc32_pclmul_mod_init);
+module_exit(crc32_pclmul_mod_fini);
+
+MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-pclmul");
diff --git a/marvell/linux/arch/x86/crypto/crc32c-intel_glue.c b/marvell/linux/arch/x86/crypto/crc32c-intel_glue.c
new file mode 100644
index 0000000..eefa086
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crc32c-intel_glue.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define SCALE_F sizeof(unsigned long)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#else
+#define REX_PRE
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * use carryless multiply version of crc32c when buffer
+ * size is >= 512 to account
+ * for fpu state save/restore overhead.
+ */
+#define CRC32C_PCL_BREAKEVEN 512
+
+asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+ unsigned int crc_init);
+#endif /* CONFIG_X86_64 */
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+ while (length--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+ :"=S"(crc)
+ :"0"(crc), "c"(*data)
+ );
+ data++;
+ }
+
+ return crc;
+}
+
+static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient = len / SCALE_F;
+ unsigned int iremainder = len % SCALE_F;
+ unsigned long *ptmp = (unsigned long *)p;
+
+ while (iquotient--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+ :"=S"(crc)
+ :"0"(crc), "c"(*ptmp)
+ );
+ ptmp++;
+ }
+
+ if (iremainder)
+ crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+ iremainder);
+
+ return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32c_intel_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
+ return 0;
+}
+
+static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ return 0;
+}
+
+static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = ~cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ /*
+ * use faster PCL version if datasize is large enough to
+ * overcome kernel fpu state save/restore overhead
+ */
+ if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, len, *crcp);
+ kernel_fpu_end();
+ } else
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
+ return 0;
+}
+
+static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ kernel_fpu_end();
+ } else
+ *(__le32 *)out =
+ ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ return 0;
+}
+
+static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+#endif /* CONFIG_X86_64 */
+
+static struct shash_alg alg = {
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .finup = crc32c_intel_finup,
+ .digest = crc32c_intel_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32c_intel_cra_init,
+ }
+};
+
+static const struct x86_cpu_id crc32c_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
+
+static int __init crc32c_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crc32c_cpu_id))
+ return -ENODEV;
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ alg.update = crc32c_pcl_intel_update;
+ alg.finup = crc32c_pcl_intel_finup;
+ alg.digest = crc32c_pcl_intel_digest;
+ }
+#endif
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-intel");
diff --git a/marvell/linux/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/marvell/linux/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
new file mode 100644
index 0000000..3c6e015
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -0,0 +1,464 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/inst.h>
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JMPTBL_ENTRY i
+.word crc_\i - crc_array
+.endm
+
+.macro JNC_LESS_THAN j
+ jnc less_than_\j
+.endm
+
+# Define threshold where buffers are considered "small" and routed to more
+# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
+# SMALL_SIZE can be no larger than 255.
+
+#define SMALL_SIZE 200
+
+.if (SMALL_SIZE > 255)
+.error "SMALL_ SIZE must be < 256"
+.endif
+
+# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+
+.text
+ENTRY(crc_pcl)
+#define bufp %rdi
+#define bufp_dw %edi
+#define bufp_w %di
+#define bufp_b %dil
+#define bufptmp %rcx
+#define block_0 %rcx
+#define block_1 %rdx
+#define block_2 %r11
+#define len %rsi
+#define len_dw %esi
+#define len_w %si
+#define len_b %sil
+#define crc_init_arg %rdx
+#define tmp %rbx
+#define crc_init %r8
+#define crc_init_dw %r8d
+#define crc1 %r9
+#define crc2 %r10
+
+ pushq %rbx
+ pushq %rdi
+ pushq %rsi
+
+ ## Move crc_init for Linux to a different
+ mov crc_init_arg, crc_init
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+
+ mov bufp, bufptmp # rdi = *buf
+ neg bufp
+ and $7, bufp # calculate the unalignment amount of
+ # the address
+ je proc_block # Skip if aligned
+
+ ## If len is less than 8 and we're unaligned, we need to jump
+ ## to special code to avoid reading beyond the end of the buffer
+ cmp $8, len
+ jae do_align
+ # less_than_8 expects length in upper 3 bits of len_dw
+ # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+ shl $32-3+1, len_dw
+ jmp less_than_8_post_shl1
+
+do_align:
+ #### Calculate CRC of unaligned bytes of the buffer (if any)
+ movq (bufptmp), tmp # load a quadward from the buffer
+ add bufp, bufptmp # align buffer pointer for quadword
+ # processing
+ sub bufp, len # update buffer length
+align_loop:
+ crc32b %bl, crc_init_dw # compute crc32 of 1-byte
+ shr $8, tmp # get next byte
+ dec bufp
+ jne align_loop
+
+proc_block:
+
+ ################################################################
+ ## 2) PROCESS BLOCKS:
+ ################################################################
+
+ ## compute num of bytes to be processed
+ movq len, tmp # save num bytes in tmp
+
+ cmpq $128*24, len
+ jae full_block
+
+continue_block:
+ cmpq $SMALL_SIZE, len
+ jb small
+
+ ## len < 128*24
+ movq $2731, %rax # 2731 = ceil(2^16 / 24)
+ mul len_dw
+ shrq $16, %rax
+
+ ## eax contains floor(bytes / 24) = num 24-byte chunks to do
+
+ ## process rax 24-byte chunks (128 >= rax >= 0)
+
+ ## compute end address of each block
+ ## block 0 (base addr + RAX * 8)
+ ## block 1 (base addr + RAX * 16)
+ ## block 2 (base addr + RAX * 24)
+ lea (bufptmp, %rax, 8), block_0
+ lea (block_0, %rax, 8), block_1
+ lea (block_1, %rax, 8), block_2
+
+ xor crc1, crc1
+ xor crc2, crc2
+
+ ## branch into array
+ lea jump_table(%rip), bufp
+ movzwq (bufp, %rax, 2), len
+ lea crc_array(%rip), bufp
+ lea (bufp, len, 1), bufp
+ JMP_NOSPEC bufp
+
+ ################################################################
+ ## 2a) PROCESS FULL BLOCKS:
+ ################################################################
+full_block:
+ movl $128,%eax
+ lea 128*8*2(block_0), block_1
+ lea 128*8*3(block_0), block_2
+ add $128*8*1, block_0
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+ # Fall thruogh into top of crc array (crc_128)
+
+ ################################################################
+ ## 3) CRC Array:
+ ################################################################
+
+crc_array:
+ i=128
+.rept 128-1
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+ crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_1), crc1
+ crc32q -i*8(block_2), crc2
+ i=(i-1)
+.endr
+
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+ crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_1), crc1
+# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
+
+ mov block_2, block_0
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
+ lea (K_table-8)(%rip), bufp # first entry is for idx 1
+ shlq $3, %rax # rax *= 8
+ pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
+ subq %rax, tmp # tmp -= rax*24
+
+ movq crc_init, %xmm1 # CRC for block 1
+ PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor -i*8(block_2), %rax
+ mov crc2, crc_init
+ crc32 %rax, crc_init
+
+ ################################################################
+ ## 5) Check for end:
+ ################################################################
+
+LABEL crc_ 0
+ mov tmp, len
+ cmp $128*24, tmp
+ jae full_block
+ cmp $24, tmp
+ jae continue_block
+
+less_than_24:
+ shl $32-4, len_dw # less_than_16 expects length
+ # in upper 4 bits of len_dw
+ jnc less_than_16
+ crc32q (bufptmp), crc_init
+ crc32q 8(bufptmp), crc_init
+ jz do_return
+ add $16, bufptmp
+ # len is less than 8 if we got here
+ # less_than_8 expects length in upper 3 bits of len_dw
+ # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+ shl $2, len_dw
+ jmp less_than_8_post_shl1
+
+ #######################################################################
+ ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+ #######################################################################
+small:
+ shl $32-8, len_dw # Prepare len_dw for less_than_256
+ j=256
+.rept 5 # j = {256, 128, 64, 32, 16}
+.altmacro
+LABEL less_than_ %j # less_than_j: Length should be in
+ # upper lg(j) bits of len_dw
+ j=(j/2)
+ shl $1, len_dw # Get next MSB
+ JNC_LESS_THAN %j
+.noaltmacro
+ i=0
+.rept (j/8)
+ crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
+ i=i+8
+.endr
+ jz do_return # Return if remaining length is zero
+ add $j, bufptmp # Advance buf
+.endr
+
+less_than_8: # Length should be stored in
+ # upper 3 bits of len_dw
+ shl $1, len_dw
+less_than_8_post_shl1:
+ jnc less_than_4
+ crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
+ jz do_return # return if remaining data is zero
+ add $4, bufptmp
+less_than_4: # Length should be stored in
+ # upper 2 bits of len_dw
+ shl $1, len_dw
+ jnc less_than_2
+ crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
+ jz do_return # return if remaining data is zero
+ add $2, bufptmp
+less_than_2: # Length should be stored in the MSB
+ # of len_dw
+ shl $1, len_dw
+ jnc less_than_1
+ crc32b (bufptmp), crc_init_dw # CRC of 1 byte
+less_than_1: # Length should be zero
+do_return:
+ movq crc_init, %rax
+ popq %rsi
+ popq %rdi
+ popq %rbx
+ ret
+ENDPROC(crc_pcl)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## jump table Table is 129 entries x 2 bytes each
+ ################################################################
+.align 4
+jump_table:
+ i=0
+.rept 129
+.altmacro
+JMPTBL_ENTRY %i
+.noaltmacro
+ i=i+1
+.endr
+
+
+ ################################################################
+ ## PCLMULQDQ tables
+ ## Table is 128 entries x 2 words (8 bytes) each
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/marvell/linux/arch/x86/crypto/crct10dif-pcl-asm_64.S b/marvell/linux/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 0000000..3d873e6
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,333 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+# Erdinc Ozturk <erdinc.ozturk@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# James Guilford <james.guilford@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Reference paper titled "Fast CRC Computation for Generic
+# Polynomials Using PCLMULQDQ Instruction"
+# URL: http://www.intel.com/content/dam/www/public/us/en/documents
+# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define init_crc %edi
+#define buf %rsi
+#define len %rdx
+
+#define FOLD_CONSTS %xmm10
+#define BSWAP_MASK %xmm11
+
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
+# reg1, reg2.
+.macro fold_32_bytes offset, reg1, reg2
+ movdqu \offset(buf), %xmm9
+ movdqu \offset+16(buf), %xmm12
+ pshufb BSWAP_MASK, %xmm9
+ pshufb BSWAP_MASK, %xmm12
+ movdqa \reg1, %xmm8
+ movdqa \reg2, %xmm13
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
+ pxor %xmm9 , \reg1
+ xorps %xmm8 , \reg1
+ pxor %xmm12, \reg2
+ xorps %xmm13, \reg2
+.endm
+
+# Fold src_reg into dst_reg.
+.macro fold_16_bytes src_reg, dst_reg
+ movdqa \src_reg, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, \dst_reg
+ xorps \src_reg, \dst_reg
+.endm
+
+#
+# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
+#
+# Assumes len >= 16.
+#
+.align 16
+ENTRY(crc_t10dif_pcl)
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp $256, len
+ jl .Lless_than_256_bytes
+
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
+ # bit order match the polynomial coefficient order.
+ movdqu 16*0(buf), %xmm0
+ movdqu 16*1(buf), %xmm1
+ movdqu 16*2(buf), %xmm2
+ movdqu 16*3(buf), %xmm3
+ movdqu 16*4(buf), %xmm4
+ movdqu 16*5(buf), %xmm5
+ movdqu 16*6(buf), %xmm6
+ movdqu 16*7(buf), %xmm7
+ add $128, buf
+ pshufb BSWAP_MASK, %xmm0
+ pshufb BSWAP_MASK, %xmm1
+ pshufb BSWAP_MASK, %xmm2
+ pshufb BSWAP_MASK, %xmm3
+ pshufb BSWAP_MASK, %xmm4
+ pshufb BSWAP_MASK, %xmm5
+ pshufb BSWAP_MASK, %xmm6
+ pshufb BSWAP_MASK, %xmm7
+
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm8, %xmm8
+ pinsrw $7, init_crc, %xmm8
+ pxor %xmm8, %xmm0
+
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
+
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
+ # 128 to simplify the termination condition of the following loop.
+ sub $256, len
+
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes 0, %xmm0, %xmm1
+ fold_32_bytes 32, %xmm2, %xmm3
+ fold_32_bytes 64, %xmm4, %xmm5
+ fold_32_bytes 96, %xmm6, %xmm7
+ add $128, buf
+ sub $128, len
+ jge .Lfold_128_bytes_loop
+
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
+
+ # Fold across 64 bytes.
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm0, %xmm4
+ fold_16_bytes %xmm1, %xmm5
+ fold_16_bytes %xmm2, %xmm6
+ fold_16_bytes %xmm3, %xmm7
+ # Fold across 32 bytes.
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm4, %xmm6
+ fold_16_bytes %xmm5, %xmm7
+ # Fold across 16 bytes.
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm6, %xmm7
+
+ # Add 128 to get the correct number of data bytes remaining in 0...127
+ # (not counting xmm7), following the previous extra subtraction by 128.
+ # Then subtract 16 to simplify the termination condition of the
+ # following loop.
+ add $128-16, len
+
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
+ # xmm7 into them, storing the result back into xmm7.
+ jl .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, %xmm7
+ movdqu (buf), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0 , %xmm7
+ add $16, buf
+ sub $16, len
+ jge .Lfold_16_bytes_loop
+
+.Lfold_16_bytes_loop_done:
+ # Add 16 to get the correct number of data bytes remaining in 0...15
+ # (not counting xmm7), following the previous extra subtraction by 16.
+ add $16, len
+ je .Lreduce_final_16_bytes
+
+.Lhandle_partial_segment:
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
+ # this without needing a fold constant for each possible 'len', redivide
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
+ # bytes, then fold the first chunk into the second.
+
+ movdqa %xmm7, %xmm2
+
+ # xmm1 = last 16 original data bytes
+ movdqu -16(buf, len), %xmm1
+ pshufb BSWAP_MASK, %xmm1
+
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
+ lea .Lbyteshift_table+16(%rip), %rax
+ sub len, %rax
+ movdqu (%rax), %xmm0
+ pshufb %xmm0, %xmm2
+
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
+ pxor .Lmask1(%rip), %xmm0
+ pshufb %xmm0, %xmm7
+
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
+ # then '16-len' bytes from xmm2 (high-order bytes).
+ pblendvb %xmm2, %xmm1 #xmm0 is implicit
+
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm1, %xmm7
+
+.Lreduce_final_16_bytes:
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
+
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
+
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ # whose low 48 bits are 0.
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
+ pslldq $8, %xmm0
+ pxor %xmm0, %xmm7 # + low bits * x^64
+
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ movdqa %xmm7, %xmm0
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
+ psrldq $12, %xmm7 # extract high 32 bits
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
+ pxor %xmm0, %xmm7 # + low bits
+
+ # Load G(x) and floor(x^48 / G(x)).
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
+
+ # Use Barrett reduction to compute the final CRC value.
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
+ psrlq $32, %xmm7 # /= x^32
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
+ psrlq $48, %xmm0
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
+
+ pextrw $0, %xmm0, %eax
+ ret
+
+.align 16
+.Lless_than_256_bytes:
+ # Checksumming a buffer of length 16...255 bytes
+
+ # Load the first 16 data bytes.
+ movdqu (buf), %xmm7
+ pshufb BSWAP_MASK, %xmm7
+ add $16, buf
+
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm0, %xmm0
+ pinsrw $7, init_crc, %xmm0
+ pxor %xmm0, %xmm7
+
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ cmp $16, len
+ je .Lreduce_final_16_bytes # len == 16
+ sub $32, len
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
+ add $16, len
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
+ENDPROC(crc_t10dif_pcl)
+
+.section .rodata, "a", @progbits
+.align 16
+
+# Fold constants precomputed from the polynomial 0x18bb7
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
+.Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
+.Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
+.Lfinal_fold_consts:
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
+.Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 # G(x)
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
+
+.section .rodata.cst16.mask1, "aM", @progbits, 16
+.align 16
+.Lmask1:
+ .octa 0x80808080808080808080808080808080
+
+.section .rodata.cst16.mask2, "aM", @progbits, 16
+.align 16
+.Lmask2:
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
+.align 16
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/marvell/linux/arch/x86/crypto/crct10dif-pclmul_glue.c b/marvell/linux/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 0000000..3c81e15
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,143 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
+
+struct chksum_desc_ctx {
+ __u16 crc;
+};
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (length >= 16 && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
+ kernel_fpu_end();
+ } else
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__u16 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
+{
+ if (len >= 16 && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
+ kernel_fpu_end();
+ } else
+ *(__u16 *)out = crc_t10dif_generic(crc, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ return __chksum_finup(0, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+static int __init crct10dif_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crct10dif_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-pclmul");
diff --git a/marvell/linux/arch/x86/crypto/des3_ede-asm_64.S b/marvell/linux/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 0000000..7fca430
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,799 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rsi
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %esi
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(, RT0, 8), RT0; \
+ xorq s6(, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(, RT2, 8), RT0; \
+ xorq s2(, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(, RL1, 8), RT0; \
+ xorq s5(, RT1, 8), to; \
+ xorq s3(, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi; /* dst */
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+
+ popq %rsi /* dst */
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(, RT3, 8), to##0; \
+ xorq s6(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(, RT3, 8), to##0; \
+ xorq s2(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(, RT3, 8), to##0; \
+ xorq s5(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(, RT3, 8), to##0; \
+ xorq s1(, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(, RT3, 8), to##1; \
+ xorq s6(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(, RT3, 8), to##1; \
+ xorq s2(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(, RT3, 8), to##1; \
+ xorq s5(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(, RT3, 8), to##1; \
+ xorq s1(, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(, RT3, 8), to##2; \
+ xorq s6(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(, RT3, 8), to##2; \
+ xorq s2(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(, RT3, 8), to##2; \
+ xorq s5(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(, RT3, 8), to##2; \
+ xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi /* dst */
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ popq %rsi /* dst */
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.section .rodata, "a", @progbits
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
diff --git a/marvell/linux/arch/x86/crypto/des3_ede_glue.c b/marvell/linux/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 0000000..89830e5
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/des.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+struct des3_ede_x86_ctx {
+ struct des3_ede_ctx enc;
+ struct des3_ede_ctx dec;
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc.expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec.expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc.expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec.expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct skcipher_request *req, const u32 *expkey)
+{
+ const unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->enc.expkey);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->dec.expkey);
+}
+
+static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[DES3_EDE_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ des3_ede_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ __be64 *src = (__be64 *)walk->src.virt.addr;
+ __be64 *dst = (__be64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[3];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+ (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+ dst[1] = src[1] ^ ctrblocks[1];
+ dst[2] = src[2] ^ ctrblocks[2];
+
+ src += 3;
+ dst += 3;
+ } while ((nbytes -= bsize * 3) >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ if (nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ err = des3_ede_expand_key(&ctx->enc, key, keylen);
+ if (err == -ENOKEY) {
+ if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)
+ err = -EINVAL;
+ else
+ err = 0;
+ }
+
+ if (err) {
+ memset(ctx, 0, sizeof(*ctx));
+ return err;
+ }
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc.expkey[i + 1], 4);
+ ctx->enc.expkey[i + 1] = tmp;
+
+ ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0];
+ ctx->dec.expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key,
+ unsigned int keylen)
+{
+ return des3_ede_x86_setkey(&tfm->base, key, keylen);
+}
+
+static struct crypto_alg des3_ede_cipher = {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg des3_ede_skciphers[] = {
+ {
+ .base.cra_name = "ecb(des3_ede)",
+ .base.cra_driver_name = "ecb-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(des3_ede)",
+ .base.cra_driver_name = "cbc-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(des3_ede)",
+ .base.cra_driver_name = "ctr-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .chunksize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, des3_ede-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&des3_ede_cipher);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+ if (err)
+ crypto_unregister_alg(&des3_ede_cipher);
+
+ return err;
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_alg(&des3_ede_cipher);
+ crypto_unregister_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("des3_ede");
+MODULE_ALIAS_CRYPTO("des3_ede-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_asm.S b/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 0000000..5d53eff
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal
+ * Erdinc Ozturk
+ * Deniz Karakoyunlu
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+#define DATA %xmm0
+#define SHASH %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP %xmm5
+#define IN1 %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble: internal ABI
+ * input:
+ * DATA: operand1
+ * SHASH: operand2, hash_key << 1 mod poly
+ * output:
+ * DATA: operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+ movaps DATA, T1
+ pshufd $0b01001110, DATA, T2
+ pshufd $0b01001110, SHASH, T3
+ pxor DATA, T2
+ pxor SHASH, T3
+
+ PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
+ PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
+ PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
+ pxor DATA, T2
+ pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+ movaps T2, T3
+ pslldq $8, T3
+ psrldq $8, T2
+ pxor T3, DATA
+ pxor T2, T1 # <T1:DATA> is result of
+ # carry-less multiplication
+
+ # first phase of the reduction
+ movaps DATA, T3
+ psllq $1, T3
+ pxor DATA, T3
+ psllq $5, T3
+ pxor DATA, T3
+ psllq $57, T3
+ movaps T3, T2
+ pslldq $8, T2
+ psrldq $8, T3
+ pxor T2, DATA
+ pxor T3, T1
+
+ # second phase of the reduction
+ movaps DATA, T2
+ psrlq $5, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor T2, T1
+ pxor T1, DATA
+ ret
+ENDPROC(__clmul_gf128mul_ble)
+
+/* void clmul_ghash_mul(char *dst, const u128 *shash) */
+ENTRY(clmul_ghash_mul)
+ FRAME_BEGIN
+ movups (%rdi), DATA
+ movups (%rsi), SHASH
+ movaps .Lbswap_mask, BSWAP
+ PSHUFB_XMM BSWAP DATA
+ call __clmul_gf128mul_ble
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+ FRAME_END
+ ret
+ENDPROC(clmul_ghash_mul)
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const u128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+ FRAME_BEGIN
+ cmp $16, %rdx
+ jb .Lupdate_just_ret # check length
+ movaps .Lbswap_mask, BSWAP
+ movups (%rdi), DATA
+ movups (%rcx), SHASH
+ PSHUFB_XMM BSWAP DATA
+.align 4
+.Lupdate_loop:
+ movups (%rsi), IN1
+ PSHUFB_XMM BSWAP IN1
+ pxor IN1, DATA
+ call __clmul_gf128mul_ble
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lupdate_loop
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+.Lupdate_just_ret:
+ FRAME_END
+ ret
+ENDPROC(clmul_ghash_update)
diff --git a/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_glue.c b/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 0000000..c9864ac
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+
+#define GHASH_BLOCK_SIZE 16
+#define GHASH_DIGEST_SIZE 16
+
+void clmul_ghash_mul(char *dst, const u128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ const u128 *shash);
+
+struct ghash_async_ctx {
+ struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+ u128 shash;
+};
+
+struct ghash_desc_ctx {
+ u8 buffer[GHASH_BLOCK_SIZE];
+ u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ u64 a, b;
+
+ if (keylen != GHASH_BLOCK_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ /* perform multiplication by 'x' in GF(2^128) */
+ a = get_unaligned_be64(key);
+ b = get_unaligned_be64(key + 8);
+
+ ctx->shash.a = (b << 1) | (a >> 63);
+ ctx->shash.b = (a << 1) | (b >> 63);
+
+ if (a >> 63)
+ ctx->shash.b ^= ((u64)0xc2) << 56;
+
+ return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *dst = dctx->buffer;
+
+ kernel_fpu_begin();
+ if (dctx->bytes) {
+ int n = min(srclen, dctx->bytes);
+ u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ clmul_ghash_mul(dst, &ctx->shash);
+ }
+
+ clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ kernel_fpu_end();
+
+ if (srclen & 0xf) {
+ src += srclen - (srclen & 0xf);
+ srclen &= 0xf;
+ dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+ while (srclen--)
+ *dst++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+ u8 *dst = dctx->buffer;
+
+ if (dctx->bytes) {
+ u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ while (dctx->bytes--)
+ *tmp++ ^= 0;
+
+ kernel_fpu_begin();
+ clmul_ghash_mul(dst, &ctx->shash);
+ kernel_fpu_end();
+ }
+
+ dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *buf = dctx->buffer;
+
+ ghash_flush(ctx, dctx);
+ memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .final = ghash_final,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "__ghash",
+ .cra_driver_name = "__ghash-pclmulqdqni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ghash_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int ghash_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ return crypto_shash_init(desc);
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!crypto_simd_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_update(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return shash_ahash_update(req, desc);
+ }
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!crypto_simd_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_final(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return crypto_shash_final(desc, req->result);
+ }
+}
+
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ ghash_async_init(req);
+ memcpy(dctx, in, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memcpy(out, dctx, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!crypto_simd_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_digest(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ return shash_ahash_digest(req, desc);
+ }
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ahash_setkey(child, key, keylen);
+ crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+
+ return err;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct cryptd_ahash *cryptd_tfm;
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+ .init = ghash_async_init,
+ .update = ghash_async_update,
+ .final = ghash_async_final,
+ .setkey = ghash_async_setkey,
+ .digest = ghash_async_digest,
+ .export = ghash_async_export,
+ .import = ghash_async_import,
+ .halg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .statesize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-clmulni",
+ .cra_priority = 400,
+ .cra_ctxsize = sizeof(struct ghash_async_ctx),
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_init = ghash_async_init_tfm,
+ .cra_exit = ghash_async_exit_tfm,
+ },
+ },
+};
+
+static const struct x86_cpu_id pcmul_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+ int err;
+
+ if (!x86_match_cpu(pcmul_cpu_id))
+ return -ENODEV;
+
+ err = crypto_register_shash(&ghash_alg);
+ if (err)
+ goto err_out;
+ err = crypto_register_ahash(&ghash_async_alg);
+ if (err)
+ goto err_shash;
+
+ return 0;
+
+err_shash:
+ crypto_unregister_shash(&ghash_alg);
+err_out:
+ return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+ crypto_unregister_ahash(&ghash_async_alg);
+ crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/marvell/linux/arch/x86/crypto/glue_helper-asm-avx.S b/marvell/linux/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 0000000..d08fc57
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Shared glue code for 128bit block ciphers, AVX assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*16)(src), x0; \
+ vmovdqu (1*16)(src), x1; \
+ vmovdqu (2*16)(src), x2; \
+ vmovdqu (3*16)(src), x3; \
+ vmovdqu (4*16)(src), x4; \
+ vmovdqu (5*16)(src), x5; \
+ vmovdqu (6*16)(src), x6; \
+ vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*16)(dst); \
+ vmovdqu x1, (1*16)(dst); \
+ vmovdqu x2, (2*16)(dst); \
+ vmovdqu x3, (3*16)(dst); \
+ vmovdqu x4, (4*16)(dst); \
+ vmovdqu x5, (5*16)(dst); \
+ vmovdqu x6, (6*16)(dst); \
+ vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x1, x1; \
+ vpxor (1*16)(src), x2, x2; \
+ vpxor (2*16)(src), x3, x3; \
+ vpxor (3*16)(src), x4, x4; \
+ vpxor (4*16)(src), x5, x5; \
+ vpxor (5*16)(src), x6, x6; \
+ vpxor (6*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
+ vpcmpeqd t0, t0, t0; \
+ vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
+ vmovdqa bswap, t1; \
+ \
+ /* load IV and byteswap */ \
+ vmovdqu (iv), x7; \
+ vpshufb t1, x7, x0; \
+ \
+ /* construct IVs */ \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x1; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x2; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x3; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x4; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x5; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x6; \
+ inc_le128(x7, t0, t2); \
+ vmovdqa x7, t2; \
+ vpshufb t1, x7, x7; \
+ inc_le128(t2, t0, t1); \
+ vmovdqu t2, (iv);
+
+#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x0, x0; \
+ vpxor (1*16)(src), x1, x1; \
+ vpxor (2*16)(src), x2, x2; \
+ vpxor (3*16)(src), x3, x3; \
+ vpxor (4*16)(src), x4, x4; \
+ vpxor (5*16)(src), x5, x5; \
+ vpxor (6*16)(src), x6, x6; \
+ vpxor (7*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+ t1, xts_gf128mul_and_shl1_mask) \
+ vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+ \
+ /* load IV */ \
+ vmovdqu (iv), tiv; \
+ vpxor (0*16)(src), tiv, x0; \
+ vmovdqu tiv, (0*16)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (1*16)(src), tiv, x1; \
+ vmovdqu tiv, (1*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (2*16)(src), tiv, x2; \
+ vmovdqu tiv, (2*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (3*16)(src), tiv, x3; \
+ vmovdqu tiv, (3*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (4*16)(src), tiv, x4; \
+ vmovdqu tiv, (4*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (5*16)(src), tiv, x5; \
+ vmovdqu tiv, (5*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (6*16)(src), tiv, x6; \
+ vmovdqu tiv, (6*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (7*16)(src), tiv, x7; \
+ vmovdqu tiv, (7*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(dst), x0, x0; \
+ vpxor (1*16)(dst), x1, x1; \
+ vpxor (2*16)(dst), x2, x2; \
+ vpxor (3*16)(dst), x3, x3; \
+ vpxor (4*16)(dst), x4, x4; \
+ vpxor (5*16)(dst), x5, x5; \
+ vpxor (6*16)(dst), x6, x6; \
+ vpxor (7*16)(dst), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/marvell/linux/arch/x86/crypto/glue_helper-asm-avx2.S b/marvell/linux/arch/x86/crypto/glue_helper-asm-avx2.S
new file mode 100644
index 0000000..d84508c
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Shared glue code for 128bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*32)(src), x0; \
+ vmovdqu (1*32)(src), x1; \
+ vmovdqu (2*32)(src), x2; \
+ vmovdqu (3*32)(src), x3; \
+ vmovdqu (4*32)(src), x4; \
+ vmovdqu (5*32)(src), x5; \
+ vmovdqu (6*32)(src), x6; \
+ vmovdqu (7*32)(src), x7;
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*32)(dst); \
+ vmovdqu x1, (1*32)(dst); \
+ vmovdqu x2, (2*32)(dst); \
+ vmovdqu x3, (3*32)(dst); \
+ vmovdqu x4, (4*32)(dst); \
+ vmovdqu x5, (5*32)(dst); \
+ vmovdqu x6, (6*32)(dst); \
+ vmovdqu x7, (7*32)(dst);
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+ vpxor t0, t0, t0; \
+ vinserti128 $1, (src), t0, t0; \
+ vpxor t0, x0, x0; \
+ vpxor (0*32+16)(src), x1, x1; \
+ vpxor (1*32+16)(src), x2, x2; \
+ vpxor (2*32+16)(src), x3, x3; \
+ vpxor (3*32+16)(src), x4, x4; \
+ vpxor (4*32+16)(src), x5, x5; \
+ vpxor (5*32+16)(src), x6, x6; \
+ vpxor (6*32+16)(src), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+ vpcmpeqq minus_one, x, tmp1; \
+ vpcmpeqq minus_two, x, tmp2; \
+ vpsubq minus_two, x, x; \
+ vpor tmp2, tmp1, tmp1; \
+ vpslldq $8, tmp1, tmp1; \
+ vpsubq tmp1, x, x;
+
+#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
+ t1x, t2, t2x, t3, t3x, t4, t5) \
+ vpcmpeqd t0, t0, t0; \
+ vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
+ vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
+ \
+ /* load IV and byteswap */ \
+ vmovdqu (iv), t2x; \
+ vmovdqa t2x, t3x; \
+ inc_le128(t2x, t0x, t1x); \
+ vbroadcasti128 bswap, t1; \
+ vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
+ vpshufb t1, t2, x0; \
+ \
+ /* construct IVs */ \
+ add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
+ vpshufb t1, t2, x1; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x2; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x3; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x4; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x5; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x6; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x7; \
+ vextracti128 $1, t2, t2x; \
+ inc_le128(t2x, t0x, t3x); \
+ vmovdqu t2x, (iv);
+
+#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*32)(src), x0, x0; \
+ vpxor (1*32)(src), x1, x1; \
+ vpxor (2*32)(src), x2, x2; \
+ vpxor (3*32)(src), x3, x3; \
+ vpxor (4*32)(src), x4, x4; \
+ vpxor (5*32)(src), x5, x5; \
+ vpxor (6*32)(src), x6, x6; \
+ vpxor (7*32)(src), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+ vpsrad $31, iv, tmp0; \
+ vpaddq iv, iv, tmp1; \
+ vpsllq $2, iv, iv; \
+ vpshufd $0x13, tmp0, tmp0; \
+ vpsrad $31, tmp1, tmp1; \
+ vpand mask2, tmp0, tmp0; \
+ vpshufd $0x13, tmp1, tmp1; \
+ vpxor tmp0, iv, iv; \
+ vpand mask1, tmp1, tmp1; \
+ vpxor tmp1, iv, iv;
+
+#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
+ tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
+ xts_gf128mul_and_shl1_mask_0, \
+ xts_gf128mul_and_shl1_mask_1) \
+ vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
+ \
+ /* load IV and construct second IV */ \
+ vmovdqu (iv), tivx; \
+ vmovdqa tivx, t0x; \
+ gf128mul_x_ble(tivx, t1x, t2x); \
+ vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
+ vinserti128 $1, tivx, t0, tiv; \
+ vpxor (0*32)(src), tiv, x0; \
+ vmovdqu tiv, (0*32)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (1*32)(src), tiv, x1; \
+ vmovdqu tiv, (1*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (2*32)(src), tiv, x2; \
+ vmovdqu tiv, (2*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (3*32)(src), tiv, x3; \
+ vmovdqu tiv, (3*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (4*32)(src), tiv, x4; \
+ vmovdqu tiv, (4*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (5*32)(src), tiv, x5; \
+ vmovdqu tiv, (5*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (6*32)(src), tiv, x6; \
+ vmovdqu tiv, (6*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (7*32)(src), tiv, x7; \
+ vmovdqu tiv, (7*32)(dst); \
+ \
+ vextracti128 $1, tiv, tivx; \
+ gf128mul_x_ble(tivx, t1x, t2x); \
+ vmovdqu tivx, (iv);
+
+#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*32)(dst), x0, x0; \
+ vpxor (1*32)(dst), x1, x1; \
+ vpxor (2*32)(dst), x2, x2; \
+ vpxor (3*32)(dst), x3, x3; \
+ vpxor (4*32)(dst), x4, x4; \
+ vpxor (5*32)(dst), x5, x5; \
+ vpxor (6*32)(dst), x6, x6; \
+ vpxor (7*32)(dst), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/marvell/linux/arch/x86/crypto/glue_helper.c b/marvell/linux/arch/x86/crypto/glue_helper.c
new file mode 100644
index 0000000..d3d91a0
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Shared glue code for 128bit block ciphers
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+
+int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes;
+ unsigned int i;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+ for (i = 0; i < gctx->num_funcs; i++) {
+ func_bytes = bsize * gctx->funcs[i].num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ gctx->funcs[i].fn_u.ecb(ctx, dst, src);
+ src += func_bytes;
+ dst += func_bytes;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ break;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_ecb_req_128bit);
+
+int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u128 *src = (u128 *)walk.src.virt.addr;
+ u128 *dst = (u128 *)walk.dst.virt.addr;
+ u128 *iv = (u128 *)walk.iv;
+
+ do {
+ u128_xor(dst, src, iv);
+ fn(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+ src++;
+ dst++;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u128 *)walk.iv = *iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit);
+
+int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u128 *src = walk.src.virt.addr;
+ u128 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes, num_blocks;
+ unsigned int i;
+ u128 last_iv;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ src -= num_blocks - 1;
+ dst -= num_blocks - 1;
+
+ gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst,
+ (const u8 *)src);
+
+ nbytes -= func_bytes;
+ if (nbytes < bsize)
+ goto done;
+
+ u128_xor(dst, dst, --src);
+ dst--;
+ } while (nbytes >= func_bytes);
+ }
+done:
+ u128_xor(dst, dst, (u128 *)walk.iv);
+ *(u128 *)walk.iv = last_iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit);
+
+int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= bsize) {
+ const u128 *src = walk.src.virt.addr;
+ u128 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes, num_blocks;
+ unsigned int i;
+ le128 ctrblk;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+
+ be128_to_le128(&ctrblk, (be128 *)walk.iv);
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst,
+ (const u8 *)src,
+ &ctrblk);
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ break;
+ }
+
+ le128_to_be128((be128 *)walk.iv, &ctrblk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ if (nbytes) {
+ le128 ctrblk;
+ u128 tmp;
+
+ be128_to_le128(&ctrblk, (be128 *)walk.iv);
+ memcpy(&tmp, walk.src.virt.addr, nbytes);
+ gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp,
+ (const u8 *)&tmp,
+ &ctrblk);
+ memcpy(walk.dst.virt.addr, &tmp, nbytes);
+ le128_to_be128((be128 *)walk.iv, &ctrblk);
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_ctr_req_128bit);
+
+static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ void *ctx,
+ struct skcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = walk->src.virt.addr;
+ u128 *dst = walk->dst.virt.addr;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Process multi-block batch */
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst,
+ (const u8 *)src,
+ walk->iv);
+
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ return nbytes;
+}
+
+int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx, bool decrypt)
+{
+ const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes, tail;
+ int err;
+
+ if (req->cryptlen < XTS_BLOCK_SIZE)
+ return -EINVAL;
+
+ if (unlikely(cts)) {
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+
+ tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ crypto_skcipher_get_flags(tfm),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ req->cryptlen - tail, req->iv);
+ req = &subreq;
+ }
+
+ err = skcipher_walk_virt(&walk, req, false);
+ nbytes = walk.nbytes;
+ if (err)
+ return err;
+
+ /* set minimum length to bsize, for tweak_fn */
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled,
+ nbytes < bsize ? bsize : nbytes);
+
+ /* calculate first value of T */
+ tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+ while (nbytes) {
+ nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ nbytes = walk.nbytes;
+ }
+
+ if (unlikely(cts)) {
+ u8 *next_tweak, *final_tweak = req->iv;
+ struct scatterlist *src, *dst;
+ struct scatterlist s[2], d[2];
+ le128 b[2];
+
+ dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(d, req->dst, req->cryptlen);
+
+ if (decrypt) {
+ next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE);
+ gf128mul_x_ble(b, b);
+ } else {
+ next_tweak = req->iv;
+ }
+
+ skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE,
+ next_tweak);
+
+ err = skcipher_walk_virt(&walk, req, false) ?:
+ skcipher_walk_done(&walk,
+ __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
+ if (err)
+ goto out;
+
+ scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0);
+ memcpy(b + 1, b, tail - XTS_BLOCK_SIZE);
+ scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE,
+ tail - XTS_BLOCK_SIZE, 0);
+ scatterwalk_map_and_copy(b, dst, 0, tail, 1);
+
+ skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE,
+ final_tweak);
+
+ err = skcipher_walk_virt(&walk, req, false) ?:
+ skcipher_walk_done(&walk,
+ __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
+ }
+
+out:
+ glue_fpu_end(fpu_enabled);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
+
+void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv, common_glue_func_t fn)
+{
+ le128 ivblk = *iv;
+
+ /* generate next IV */
+ gf128mul_x_ble(iv, &ivblk);
+
+ /* CC <- T xor C */
+ u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk);
+
+ /* PP <- D(Key2,CC) */
+ fn(ctx, dst, dst);
+
+ /* P <- T xor PP */
+ u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk);
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
+
+MODULE_LICENSE("GPL");
diff --git a/marvell/linux/arch/x86/crypto/nh-avx2-x86_64.S b/marvell/linux/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 0000000..f7946ea
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %ymm0
+#define PASS1_SUMS %ymm1
+#define PASS2_SUMS %ymm2
+#define PASS3_SUMS %ymm3
+#define K0 %ymm4
+#define K0_XMM %xmm4
+#define K1 %ymm5
+#define K1_XMM %xmm5
+#define K2 %ymm6
+#define K2_XMM %xmm6
+#define K3 %ymm7
+#define K3_XMM %xmm7
+#define T0 %ymm8
+#define T1 %ymm9
+#define T2 %ymm10
+#define T2_XMM %xmm10
+#define T3 %ymm11
+#define T3_XMM %xmm11
+#define T4 %ymm12
+#define T5 %ymm13
+#define T6 %ymm14
+#define T7 %ymm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_2xstride k0, k1, k2, k3
+
+ // Add message words to key words
+ vpaddd \k0, T3, T0
+ vpaddd \k1, T3, T1
+ vpaddd \k2, T3, T2
+ vpaddd \k3, T3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ vpshufd $0x10, T0, T4
+ vpshufd $0x32, T0, T0
+ vpshufd $0x10, T1, T5
+ vpshufd $0x32, T1, T1
+ vpshufd $0x10, T2, T6
+ vpshufd $0x32, T2, T2
+ vpshufd $0x10, T3, T7
+ vpshufd $0x32, T3, T3
+ vpmuludq T4, T0, T0
+ vpmuludq T5, T1, T1
+ vpmuludq T6, T2, T2
+ vpmuludq T7, T3, T3
+ vpaddq T0, PASS0_SUMS, PASS0_SUMS
+ vpaddq T1, PASS1_SUMS, PASS1_SUMS
+ vpaddq T2, PASS2_SUMS, PASS2_SUMS
+ vpaddq T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+ vmovdqu 0x00(KEY), K0
+ vmovdqu 0x10(KEY), K1
+ add $0x20, KEY
+ vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+ vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+ vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+ vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+
+ vmovdqu 0x20(MESSAGE), T3
+ vmovdqu 0x20(KEY), K0
+ vmovdqu 0x30(KEY), K1
+ _nh_2xstride K2, K3, K0, K1
+
+ add $0x40, MESSAGE
+ add $0x40, KEY
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+
+ cmp $0x20, MESSAGE_LEN
+ jl .Llast
+
+ // 2 or 3 strides remain; do 2 more.
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+ add $0x20, MESSAGE
+ add $0x20, KEY
+ sub $0x20, MESSAGE_LEN
+ jz .Ldone
+ vmovdqa K2, K0
+ vmovdqa K3, K1
+.Llast:
+ // Last stride. Zero the high 128 bits of the message and keys so they
+ // don't affect the result when processing them like 2 strides.
+ vmovdqu (MESSAGE), T3_XMM
+ vmovdqa K0_XMM, K0_XMM
+ vmovdqa K1_XMM, K1_XMM
+ vmovdqu 0x00(KEY), K2_XMM
+ vmovdqu 0x10(KEY), K3_XMM
+ _nh_2xstride K0, K1, K2, K3
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+
+ // PASS0_SUMS is (0A 0B 0C 0D)
+ // PASS1_SUMS is (1A 1B 1C 1D)
+ // PASS2_SUMS is (2A 2B 2C 2D)
+ // PASS3_SUMS is (3A 3B 3C 3D)
+ // We need the horizontal sums:
+ // (0A + 0B + 0C + 0D,
+ // 1A + 1B + 1C + 1D,
+ // 2A + 2B + 2C + 2D,
+ // 3A + 3B + 3C + 3D)
+ //
+
+ vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
+ vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
+ vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
+ vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
+
+ vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
+ vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
+ vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
+ vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
+
+ vpaddq T5, T4, T4
+ vpaddq T1, T0, T0
+ vpaddq T4, T0, T0
+ vmovdqu T0, (HASH)
+ ret
+ENDPROC(nh_avx2)
diff --git a/marvell/linux/arch/x86/crypto/nh-sse2-x86_64.S b/marvell/linux/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 0000000..51f52d4
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %xmm0
+#define PASS1_SUMS %xmm1
+#define PASS2_SUMS %xmm2
+#define PASS3_SUMS %xmm3
+#define K0 %xmm4
+#define K1 %xmm5
+#define K2 %xmm6
+#define K3 %xmm7
+#define T0 %xmm8
+#define T1 %xmm9
+#define T2 %xmm10
+#define T3 %xmm11
+#define T4 %xmm12
+#define T5 %xmm13
+#define T6 %xmm14
+#define T7 %xmm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_stride k0, k1, k2, k3, offset
+
+ // Load next message stride
+ movdqu \offset(MESSAGE), T1
+
+ // Load next key stride
+ movdqu \offset(KEY), \k3
+
+ // Add message words to key words
+ movdqa T1, T2
+ movdqa T1, T3
+ paddd T1, \k0 // reuse k0 to avoid a move
+ paddd \k1, T1
+ paddd \k2, T2
+ paddd \k3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ pshufd $0x10, \k0, T4
+ pshufd $0x32, \k0, \k0
+ pshufd $0x10, T1, T5
+ pshufd $0x32, T1, T1
+ pshufd $0x10, T2, T6
+ pshufd $0x32, T2, T2
+ pshufd $0x10, T3, T7
+ pshufd $0x32, T3, T3
+ pmuludq T4, \k0
+ pmuludq T5, T1
+ pmuludq T6, T2
+ pmuludq T7, T3
+ paddq \k0, PASS0_SUMS
+ paddq T1, PASS1_SUMS
+ paddq T2, PASS2_SUMS
+ paddq T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+ movdqu 0x00(KEY), K0
+ movdqu 0x10(KEY), K1
+ movdqu 0x20(KEY), K2
+ add $0x30, KEY
+ pxor PASS0_SUMS, PASS0_SUMS
+ pxor PASS1_SUMS, PASS1_SUMS
+ pxor PASS2_SUMS, PASS2_SUMS
+ pxor PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3, 0x00
+ _nh_stride K1, K2, K3, K0, 0x10
+ _nh_stride K2, K3, K0, K1, 0x20
+ _nh_stride K3, K0, K1, K2, 0x30
+ add $0x40, KEY
+ add $0x40, MESSAGE
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K0, K1, K2, K3, 0x00
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K1, K2, K3, K0, 0x10
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K2, K3, K0, K1, 0x20
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ movdqa PASS0_SUMS, T0
+ movdqa PASS2_SUMS, T1
+ punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
+ punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
+ punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
+ punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
+ paddq PASS0_SUMS, T0
+ paddq PASS2_SUMS, T1
+ movdqu T0, 0x00(HASH)
+ movdqu T1, 0x10(HASH)
+ ret
+ENDPROC(nh_sse2)
diff --git a/marvell/linux/arch/x86/crypto/nhpoly1305-avx2-glue.c b/marvell/linux/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 0000000..80fcb85
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_avx2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-avx2",
+ .base.cra_priority = 300,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_avx2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/marvell/linux/arch/x86/crypto/nhpoly1305-sse2-glue.c b/marvell/linux/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 0000000..cc6b7c1
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_sse2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-sse2",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_sse2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/marvell/linux/arch/x86/crypto/poly1305-avx2-x86_64.S b/marvell/linux/arch/x86/crypto/poly1305-avx2-x86_64.S
new file mode 100644
index 0000000..8b341bc
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ANMASK, "aM", @progbits, 32
+.align 32
+ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+ .octa 0x0000000003ffffff0000000003ffffff
+
+.section .rodata.cst32.ORMASK, "aM", @progbits, 32
+.align 32
+ORMASK: .octa 0x00000000010000000000000001000000
+ .octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define w0 0x14(%r8)
+#define w1 0x18(%r8)
+#define w2 0x1c(%r8)
+#define w3 0x20(%r8)
+#define w4 0x24(%r8)
+#define y0 0x28(%r8)
+#define y1 0x2c(%r8)
+#define y2 0x30(%r8)
+#define y3 0x34(%r8)
+#define y4 0x38(%r8)
+#define m %rsi
+#define hc0 %ymm0
+#define hc1 %ymm1
+#define hc2 %ymm2
+#define hc3 %ymm3
+#define hc4 %ymm4
+#define hc0x %xmm0
+#define hc1x %xmm1
+#define hc2x %xmm2
+#define hc3x %xmm3
+#define hc4x %xmm4
+#define t1 %ymm5
+#define t2 %ymm6
+#define t1x %xmm5
+#define t2x %xmm6
+#define ruwy0 %ymm7
+#define ruwy1 %ymm8
+#define ruwy2 %ymm9
+#define ruwy3 %ymm10
+#define ruwy4 %ymm11
+#define ruwy0x %xmm7
+#define ruwy1x %xmm8
+#define ruwy2x %xmm9
+#define ruwy3x %xmm10
+#define ruwy4x %xmm11
+#define svxz1 %ymm12
+#define svxz2 %ymm13
+#define svxz3 %ymm14
+#define svxz4 %ymm15
+#define d0 %r9
+#define d1 %r10
+#define d2 %r11
+#define d3 %r12
+#define d4 %r13
+
+ENTRY(poly1305_4block_avx2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 64 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Quadblock count
+ # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
+
+ # This four-block variant uses loop unrolled block processing. It
+ # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
+ # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+
+ vzeroupper
+ push %rbx
+ push %r12
+ push %r13
+
+ # combine r0,u0,w0,y0
+ vmovd y0,ruwy0x
+ vmovd w0,t1x
+ vpunpcklqdq t1,ruwy0,ruwy0
+ vmovd u0,t1x
+ vmovd r0,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy0,ruwy0
+
+ # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
+ vmovd y1,ruwy1x
+ vmovd w1,t1x
+ vpunpcklqdq t1,ruwy1,ruwy1
+ vmovd u1,t1x
+ vmovd r1,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy1,ruwy1
+ vpslld $2,ruwy1,svxz1
+ vpaddd ruwy1,svxz1,svxz1
+
+ # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
+ vmovd y2,ruwy2x
+ vmovd w2,t1x
+ vpunpcklqdq t1,ruwy2,ruwy2
+ vmovd u2,t1x
+ vmovd r2,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy2,ruwy2
+ vpslld $2,ruwy2,svxz2
+ vpaddd ruwy2,svxz2,svxz2
+
+ # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
+ vmovd y3,ruwy3x
+ vmovd w3,t1x
+ vpunpcklqdq t1,ruwy3,ruwy3
+ vmovd u3,t1x
+ vmovd r3,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy3,ruwy3
+ vpslld $2,ruwy3,svxz3
+ vpaddd ruwy3,svxz3,svxz3
+
+ # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
+ vmovd y4,ruwy4x
+ vmovd w4,t1x
+ vpunpcklqdq t1,ruwy4,ruwy4
+ vmovd u4,t1x
+ vmovd r4,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy4,ruwy4
+ vpslld $2,ruwy4,svxz4
+ vpaddd ruwy4,svxz4,svxz4
+
+.Ldoblock4:
+ # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
+ # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
+ vmovd 0x00(m),hc0x
+ vmovd 0x10(m),t1x
+ vpunpcklqdq t1,hc0,hc0
+ vmovd 0x20(m),t1x
+ vmovd 0x30(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc0,hc0
+ vpand ANMASK(%rip),hc0,hc0
+ vmovd h0,t1x
+ vpaddd t1,hc0,hc0
+ # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
+ # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
+ vmovd 0x03(m),hc1x
+ vmovd 0x13(m),t1x
+ vpunpcklqdq t1,hc1,hc1
+ vmovd 0x23(m),t1x
+ vmovd 0x33(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc1,hc1
+ vpsrld $2,hc1,hc1
+ vpand ANMASK(%rip),hc1,hc1
+ vmovd h1,t1x
+ vpaddd t1,hc1,hc1
+ # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
+ # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
+ vmovd 0x06(m),hc2x
+ vmovd 0x16(m),t1x
+ vpunpcklqdq t1,hc2,hc2
+ vmovd 0x26(m),t1x
+ vmovd 0x36(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc2,hc2
+ vpsrld $4,hc2,hc2
+ vpand ANMASK(%rip),hc2,hc2
+ vmovd h2,t1x
+ vpaddd t1,hc2,hc2
+ # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
+ # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
+ vmovd 0x09(m),hc3x
+ vmovd 0x19(m),t1x
+ vpunpcklqdq t1,hc3,hc3
+ vmovd 0x29(m),t1x
+ vmovd 0x39(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc3,hc3
+ vpsrld $6,hc3,hc3
+ vpand ANMASK(%rip),hc3,hc3
+ vmovd h3,t1x
+ vpaddd t1,hc3,hc3
+ # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
+ # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
+ vmovd 0x0c(m),hc4x
+ vmovd 0x1c(m),t1x
+ vpunpcklqdq t1,hc4,hc4
+ vmovd 0x2c(m),t1x
+ vmovd 0x3c(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc4,hc4
+ vpsrld $8,hc4,hc4
+ vpor ORMASK(%rip),hc4,hc4
+ vmovd h4,t1x
+ vpaddd t1,hc4,hc4
+
+ # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
+ vpmuludq hc0,ruwy0,t1
+ # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
+ vpmuludq hc1,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
+ vpmuludq hc2,svxz3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
+ vpmuludq hc3,svxz2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
+ vpmuludq hc4,svxz1,t2
+ vpaddq t2,t1,t1
+ # d0 = t1[0] + t1[1] + t[2] + t[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d0
+
+ # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
+ vpmuludq hc0,ruwy1,t1
+ # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
+ vpmuludq hc1,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
+ vpmuludq hc2,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
+ vpmuludq hc3,svxz3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
+ vpmuludq hc4,svxz2,t2
+ vpaddq t2,t1,t1
+ # d1 = t1[0] + t1[1] + t1[3] + t1[4]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d1
+
+ # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
+ vpmuludq hc0,ruwy2,t1
+ # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
+ vpmuludq hc1,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
+ vpmuludq hc2,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
+ vpmuludq hc3,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
+ vpmuludq hc4,svxz3,t2
+ vpaddq t2,t1,t1
+ # d2 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d2
+
+ # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
+ vpmuludq hc0,ruwy3,t1
+ # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
+ vpmuludq hc1,ruwy2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
+ vpmuludq hc2,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
+ vpmuludq hc3,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
+ vpmuludq hc4,svxz4,t2
+ vpaddq t2,t1,t1
+ # d3 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d3
+
+ # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
+ vpmuludq hc0,ruwy4,t1
+ # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
+ vpmuludq hc1,ruwy3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
+ vpmuludq hc2,ruwy2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
+ vpmuludq hc3,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
+ vpmuludq hc4,ruwy0,t2
+ vpaddq t2,t1,t1
+ # d4 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d4
+
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x40,m
+ dec %rcx
+ jnz .Ldoblock4
+
+ vzeroupper
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_4block_avx2)
diff --git a/marvell/linux/arch/x86/crypto/poly1305-sse2-x86_64.S b/marvell/linux/arch/x86/crypto/poly1305-sse2-x86_64.S
new file mode 100644
index 0000000..5578f84
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.ANMASK, "aM", @progbits, 16
+.align 16
+ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+
+.section .rodata.cst16.ORMASK, "aM", @progbits, 16
+.align 16
+ORMASK: .octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define s1 0x00(%rsp)
+#define s2 0x04(%rsp)
+#define s3 0x08(%rsp)
+#define s4 0x0c(%rsp)
+#define m %rsi
+#define h01 %xmm0
+#define h23 %xmm1
+#define h44 %xmm2
+#define t1 %xmm3
+#define t2 %xmm4
+#define t3 %xmm5
+#define t4 %xmm6
+#define mask %xmm7
+#define d0 %r8
+#define d1 %r9
+#define d2 %r10
+#define d3 %r11
+#define d4 %r12
+
+ENTRY(poly1305_block_sse2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 16 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Block count
+
+ # This single block variant tries to improve performance by doing two
+ # multiplications in parallel using SSE instructions. There is quite
+ # some quardword packing involved, hence the speedup is marginal.
+
+ push %rbx
+ push %r12
+ sub $0x10,%rsp
+
+ # s1..s4 = r1..r4 * 5
+ mov r1,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s1
+ mov r2,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s2
+ mov r3,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s3
+ mov r4,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s4
+
+ movdqa ANMASK(%rip),mask
+
+.Ldoblock:
+ # h01 = [0, h1, 0, h0]
+ # h23 = [0, h3, 0, h2]
+ # h44 = [0, h4, 0, h4]
+ movd h0,h01
+ movd h1,t1
+ movd h2,h23
+ movd h3,t2
+ movd h4,h44
+ punpcklqdq t1,h01
+ punpcklqdq t2,h23
+ punpcklqdq h44,h44
+
+ # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+ movd 0x00(m),t1
+ movd 0x03(m),t2
+ psrld $2,t2
+ punpcklqdq t2,t1
+ pand mask,t1
+ paddd t1,h01
+ # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+ movd 0x06(m),t1
+ movd 0x09(m),t2
+ psrld $4,t1
+ psrld $6,t2
+ punpcklqdq t2,t1
+ pand mask,t1
+ paddd t1,h23
+ # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+ mov 0x0c(m),%eax
+ shr $8,%eax
+ or $0x01000000,%eax
+ movd %eax,t1
+ pshufd $0xc4,t1,t1
+ paddd t1,h44
+
+ # t1[0] = h0 * r0 + h2 * s3
+ # t1[1] = h1 * s4 + h3 * s2
+ movd r0,t1
+ movd s4,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd s3,t2
+ movd s2,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t2[0] = h0 * r1 + h2 * s4
+ # t2[1] = h1 * r0 + h3 * s3
+ movd r1,t2
+ movd r0,t3
+ punpcklqdq t3,t2
+ pmuludq h01,t2
+ movd s4,t3
+ movd s3,t4
+ punpcklqdq t4,t3
+ pmuludq h23,t3
+ paddq t3,t2
+ # t3[0] = h4 * s1
+ # t3[1] = h4 * s2
+ movd s1,t3
+ movd s2,t4
+ punpcklqdq t4,t3
+ pmuludq h44,t3
+ # d0 = t1[0] + t1[1] + t3[0]
+ # d1 = t2[0] + t2[1] + t3[1]
+ movdqa t1,t4
+ punpcklqdq t2,t4
+ punpckhqdq t2,t1
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d0
+ psrldq $8,t1
+ movq t1,d1
+
+ # t1[0] = h0 * r2 + h2 * r0
+ # t1[1] = h1 * r1 + h3 * s4
+ movd r2,t1
+ movd r1,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd r0,t2
+ movd s4,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t2[0] = h0 * r3 + h2 * r1
+ # t2[1] = h1 * r2 + h3 * r0
+ movd r3,t2
+ movd r2,t3
+ punpcklqdq t3,t2
+ pmuludq h01,t2
+ movd r1,t3
+ movd r0,t4
+ punpcklqdq t4,t3
+ pmuludq h23,t3
+ paddq t3,t2
+ # t3[0] = h4 * s3
+ # t3[1] = h4 * s4
+ movd s3,t3
+ movd s4,t4
+ punpcklqdq t4,t3
+ pmuludq h44,t3
+ # d2 = t1[0] + t1[1] + t3[0]
+ # d3 = t2[0] + t2[1] + t3[1]
+ movdqa t1,t4
+ punpcklqdq t2,t4
+ punpckhqdq t2,t1
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d2
+ psrldq $8,t1
+ movq t1,d3
+
+ # t1[0] = h0 * r4 + h2 * r2
+ # t1[1] = h1 * r3 + h3 * r1
+ movd r4,t1
+ movd r3,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd r2,t2
+ movd r1,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t3[0] = h4 * r0
+ movd r0,t3
+ pmuludq h44,t3
+ # d4 = t1[0] + t1[1] + t3[0]
+ movdqa t1,t4
+ psrldq $8,t4
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d4
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x10,m
+ dec %rcx
+ jnz .Ldoblock
+
+ # Zeroing of key material
+ mov %rcx,0x00(%rsp)
+ mov %rcx,0x08(%rsp)
+
+ add $0x10,%rsp
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_block_sse2)
+
+
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define hc0 %xmm0
+#define hc1 %xmm1
+#define hc2 %xmm2
+#define hc3 %xmm5
+#define hc4 %xmm6
+#define ru0 %xmm7
+#define ru1 %xmm8
+#define ru2 %xmm9
+#define ru3 %xmm10
+#define ru4 %xmm11
+#define sv1 %xmm12
+#define sv2 %xmm13
+#define sv3 %xmm14
+#define sv4 %xmm15
+#undef d0
+#define d0 %r13
+
+ENTRY(poly1305_2block_sse2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 16 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Doubleblock count
+ # %r8: Poly1305 derived key r^2 u[5]
+
+ # This two-block variant further improves performance by using loop
+ # unrolled block processing. This is more straight forward and does
+ # less byte shuffling, but requires a second Poly1305 key r^2:
+ # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
+
+ push %rbx
+ push %r12
+ push %r13
+
+ # combine r0,u0
+ movd u0,ru0
+ movd r0,t1
+ punpcklqdq t1,ru0
+
+ # combine r1,u1 and s1=r1*5,v1=u1*5
+ movd u1,ru1
+ movd r1,t1
+ punpcklqdq t1,ru1
+ movdqa ru1,sv1
+ pslld $2,sv1
+ paddd ru1,sv1
+
+ # combine r2,u2 and s2=r2*5,v2=u2*5
+ movd u2,ru2
+ movd r2,t1
+ punpcklqdq t1,ru2
+ movdqa ru2,sv2
+ pslld $2,sv2
+ paddd ru2,sv2
+
+ # combine r3,u3 and s3=r3*5,v3=u3*5
+ movd u3,ru3
+ movd r3,t1
+ punpcklqdq t1,ru3
+ movdqa ru3,sv3
+ pslld $2,sv3
+ paddd ru3,sv3
+
+ # combine r4,u4 and s4=r4*5,v4=u4*5
+ movd u4,ru4
+ movd r4,t1
+ punpcklqdq t1,ru4
+ movdqa ru4,sv4
+ pslld $2,sv4
+ paddd ru4,sv4
+
+.Ldoblock2:
+ # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
+ movd 0x00(m),hc0
+ movd 0x10(m),t1
+ punpcklqdq t1,hc0
+ pand ANMASK(%rip),hc0
+ movd h0,t1
+ paddd t1,hc0
+ # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
+ movd 0x03(m),hc1
+ movd 0x13(m),t1
+ punpcklqdq t1,hc1
+ psrld $2,hc1
+ pand ANMASK(%rip),hc1
+ movd h1,t1
+ paddd t1,hc1
+ # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
+ movd 0x06(m),hc2
+ movd 0x16(m),t1
+ punpcklqdq t1,hc2
+ psrld $4,hc2
+ pand ANMASK(%rip),hc2
+ movd h2,t1
+ paddd t1,hc2
+ # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
+ movd 0x09(m),hc3
+ movd 0x19(m),t1
+ punpcklqdq t1,hc3
+ psrld $6,hc3
+ pand ANMASK(%rip),hc3
+ movd h3,t1
+ paddd t1,hc3
+ # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
+ movd 0x0c(m),hc4
+ movd 0x1c(m),t1
+ punpcklqdq t1,hc4
+ psrld $8,hc4
+ por ORMASK(%rip),hc4
+ movd h4,t1
+ paddd t1,hc4
+
+ # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
+ movdqa ru0,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
+ movdqa sv2,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
+ movdqa sv1,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d0 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d0
+
+ # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
+ movdqa ru1,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
+ movdqa sv2,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d1 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d1
+
+ # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
+ movdqa ru2,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d2 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d2
+
+ # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
+ movdqa ru3,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
+ movdqa ru2,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d3 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d3
+
+ # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
+ movdqa ru4,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
+ movdqa ru3,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
+ movdqa ru2,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d4 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d4
+
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x20,m
+ dec %rcx
+ jnz .Ldoblock2
+
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_2block_sse2)
diff --git a/marvell/linux/arch/x86/crypto/poly1305_glue.c b/marvell/linux/arch/x86/crypto/poly1305_glue.c
new file mode 100644
index 0000000..4a1c05d
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/poly1305_glue.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/poly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+struct poly1305_simd_desc_ctx {
+ struct poly1305_desc_ctx base;
+ /* derived key u set? */
+ bool uset;
+#ifdef CONFIG_AS_AVX2
+ /* derived keys r^3, r^4 set? */
+ bool wset;
+#endif
+ /* derived Poly1305 key r^2 */
+ u32 u[5];
+ /* ... silently appended r^3 and r^4 when using AVX2 */
+};
+
+asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
+ const u32 *r, unsigned int blocks);
+asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+ unsigned int blocks, const u32 *u);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+ unsigned int blocks, const u32 *u);
+static bool poly1305_use_avx2;
+#endif
+
+static int poly1305_simd_init(struct shash_desc *desc)
+{
+ struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
+
+ sctx->uset = false;
+#ifdef CONFIG_AS_AVX2
+ sctx->wset = false;
+#endif
+
+ return crypto_poly1305_init(desc);
+}
+
+static void poly1305_simd_mult(u32 *a, const u32 *b)
+{
+ u8 m[POLY1305_BLOCK_SIZE];
+
+ memset(m, 0, sizeof(m));
+ /* The poly1305 block function adds a hi-bit to the accumulator which
+ * we don't need for key multiplication; compensate for it. */
+ a[4] -= 1 << 24;
+ poly1305_block_sse2(a, m, b, 1);
+}
+
+static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_simd_desc_ctx *sctx;
+ unsigned int blocks, datalen;
+
+ BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
+ sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
+
+ if (unlikely(!dctx->sset)) {
+ datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+ src += srclen - datalen;
+ srclen = datalen;
+ }
+
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
+ if (unlikely(!sctx->wset)) {
+ if (!sctx->uset) {
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
+ sctx->uset = true;
+ }
+ memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u + 5, dctx->r.r);
+ memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u + 10, dctx->r.r);
+ sctx->wset = true;
+ }
+ blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
+ poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
+ src += POLY1305_BLOCK_SIZE * 4 * blocks;
+ srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+ }
+#endif
+ if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+ if (unlikely(!sctx->uset)) {
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
+ sctx->uset = true;
+ }
+ blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+ poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
+ src += POLY1305_BLOCK_SIZE * 2 * blocks;
+ srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+ }
+ if (srclen >= POLY1305_BLOCK_SIZE) {
+ poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
+ srclen -= POLY1305_BLOCK_SIZE;
+ }
+ return srclen;
+}
+
+static int poly1305_simd_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ unsigned int bytes;
+
+ /* kernel_fpu_begin/end is costly, use fallback for small updates */
+ if (srclen <= 288 || !crypto_simd_usable())
+ return crypto_poly1305_update(desc, src, srclen);
+
+ kernel_fpu_begin();
+
+ if (unlikely(dctx->buflen)) {
+ bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ srclen -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_simd_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+ bytes = poly1305_simd_blocks(dctx, src, srclen);
+ src += srclen - bytes;
+ srclen = bytes;
+ }
+
+ kernel_fpu_end();
+
+ if (unlikely(srclen)) {
+ dctx->buflen = srclen;
+ memcpy(dctx->buf, src, srclen);
+ }
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = poly1305_simd_init,
+ .update = poly1305_simd_update,
+ .final = crypto_poly1305_final,
+ .descsize = sizeof(struct poly1305_simd_desc_ctx),
+ .base = {
+ .cra_name = "poly1305",
+ .cra_driver_name = "poly1305-simd",
+ .cra_priority = 300,
+ .cra_blocksize = POLY1305_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init poly1305_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
+ if (poly1305_use_avx2)
+ alg.descsize += 10 * sizeof(u32);
+#endif
+ return crypto_register_shash(&alg);
+}
+
+static void __exit poly1305_simd_mod_exit(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(poly1305_simd_mod_init);
+module_exit(poly1305_simd_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("Poly1305 authenticator");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-simd");
diff --git a/marvell/linux/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 0000000..ddc51db
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,781 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "serpent-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way AVX serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define tp %xmm5
+
+#define RA2 %xmm6
+#define RB2 %xmm7
+#define RC2 %xmm8
+#define RD2 %xmm9
+#define RE2 %xmm10
+
+#define RNOT %xmm11
+
+#define RK0 %xmm12
+#define RK1 %xmm13
+#define RK2 %xmm14
+#define RK3 %xmm15
+
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i, j, t) \
+ vbroadcastss (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk8_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk8_avx)
+
+.align 8
+__serpent_dec_blk8_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_dec_blk8_avx)
+
+ENTRY(serpent_ecb_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_enc_8way_avx)
+
+ENTRY(serpent_ecb_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_dec_8way_avx)
+
+ENTRY(serpent_cbc_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_cbc_dec_8way_avx)
+
+ENTRY(serpent_ctr_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK1, RK2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ctr_8way_avx)
+
+ENTRY(serpent_xts_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_enc_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_enc_8way_avx)
+
+ENTRY(serpent_xts_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_dec_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_dec_8way_avx)
diff --git a/marvell/linux/arch/x86/crypto/serpent-avx2-asm_64.S b/marvell/linux/arch/x86/crypto/serpent-avx2-asm_64.S
new file mode 100644
index 0000000..37bc1d4
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -0,0 +1,813 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ * Copyright © 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask_0:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask_1:
+ .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i,j,t) \
+ vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk16;
+
+ store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+ RK0);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ tp);
+
+ call __serpent_enc_blk16;
+
+ store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ .Lxts_gf128mul_and_shl1_mask_0,
+ .Lxts_gf128mul_and_shl1_mask_1);
+
+ call __serpent_enc_blk16;
+
+ store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ .Lxts_gf128mul_and_shl1_mask_0,
+ .Lxts_gf128mul_and_shl1_mask_1);
+
+ call __serpent_dec_blk16;
+
+ store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_dec_16way)
diff --git a/marvell/linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/marvell/linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 0000000..e5c4a46
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+ 4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, x4); \
+ get_key(i, 1, RT0); \
+ get_key(i, 2, RT1); \
+ pxor x4, x0; \
+ pxor RT0, x1; \
+ pxor RT1, x2; \
+ get_key(i, 3, x4); \
+ pxor x4, x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+ movdqa x0, x4; \
+ pslld $13, x0; \
+ psrld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x0, x1; \
+ movdqa x2, x4; \
+ pslld $3, x2; \
+ psrld $(32 - 3), x4; \
+ por x4, x2; \
+ pxor x2, x1; \
+ movdqa x1, x4; \
+ pslld $1, x1; \
+ psrld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x2, x3; \
+ pxor x4, x3; \
+ movdqa x3, x4; \
+ pslld $7, x3; \
+ psrld $(32 - 7), x4; \
+ por x4, x3; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x3, x0; \
+ pxor x3, x2; \
+ pxor x4, x2; \
+ movdqa x0, x4; \
+ get_key(i, 1, RT0); \
+ pxor RT0, x1; \
+ get_key(i, 3, RT0); \
+ pxor RT0, x3; \
+ pslld $5, x0; \
+ psrld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ pslld $22, x2; \
+ psrld $(32 - 22), x4; \
+ por x4, x2; \
+ get_key(i, 0, RT0); \
+ pxor RT0, x0; \
+ get_key(i, 2, RT0); \
+ pxor RT0, x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+ K(x0, x1, x2, x3, x4, i); \
+ movdqa x0, x4; \
+ psrld $5, x0; \
+ pslld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ psrld $22, x2; \
+ pslld $(32 - 22), x4; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pxor x3, x0; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x4, x2; \
+ movdqa x1, x4; \
+ psrld $1, x1; \
+ pslld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x3, x4; \
+ psrld $7, x3; \
+ pslld $(32 - 7), x4; \
+ por x4, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x4, x3; \
+ movdqa x0, x4; \
+ psrld $13, x0; \
+ pslld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x2, x1; \
+ pxor x2, x3; \
+ movdqa x2, x4; \
+ psrld $3, x2; \
+ pslld $(32 - 3), x4; \
+ por x4, x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2; \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1; \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4; \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1; \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0; \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2; \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2; \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1; \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3; \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0; \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+ENTRY(__serpent_enc_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ * arg_xor(%esp): bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 0);
+ S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
+ S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
+ S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
+ S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
+ S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
+ S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
+ S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
+ S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
+ S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
+ S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
+ S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
+ S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
+ S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
+ S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
+ S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
+ S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
+ S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
+ S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
+ S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
+ S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
+ S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
+ S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
+ S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
+ S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
+ S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
+ S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
+ S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
+ S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
+ S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
+ S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
+ S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
+ S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
+
+ movl arg_dst(%esp), %eax;
+
+ cmpb $0, arg_xor(%esp);
+ jnz .L__enc_xor4;
+
+ write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+.L__enc_xor4:
+ xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+ENDPROC(__serpent_enc_blk_4way)
+
+ENTRY(serpent_dec_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 32);
+ SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
+ SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
+ SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
+ SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
+ SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
+ SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
+ SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
+ SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
+ SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
+ SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
+ SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
+ SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
+ SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
+ SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
+ SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
+ SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
+ SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
+ SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
+ SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
+ SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
+ SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
+ SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
+ SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
+ SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
+ SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
+ SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
+ SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
+ SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
+ SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
+ SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
+ SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
+ SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
+
+ movl arg_dst(%esp), %eax;
+ write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+ ret;
+ENDPROC(serpent_dec_blk_4way)
diff --git a/marvell/linux/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 0000000..5e0b3a3
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,739 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ pxor RK0, x0 ## 1; \
+ pxor RK1, x1 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK1, x1 ## 2; \
+ pxor RK2, x2 ## 2; \
+ pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $13, x0 ## 1; \
+ psrld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $3, x2 ## 1; \
+ psrld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $13, x0 ## 2; \
+ psrld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $3, x2 ## 2; \
+ psrld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $1, x1 ## 1; \
+ psrld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x3 ## 1, x4 ## 1; \
+ get_key(i, 1, RK1); \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $1, x1 ## 2; \
+ psrld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x3 ## 2, x4 ## 2; \
+ get_key(i, 3, RK3); \
+ pslld $7, x3 ## 1; \
+ psrld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ pslld $7, x3 ## 2; \
+ psrld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ pxor RK1, x1 ## 1; \
+ pxor RK3, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $5, x0 ## 1; \
+ psrld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $22, x2 ## 1; \
+ psrld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK1, x1 ## 2; \
+ pxor RK3, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $5, x0 ## 2; \
+ psrld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $22, x2 ## 2; \
+ psrld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ psrld $5, x0 ## 1; \
+ pslld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $22, x2 ## 1; \
+ pslld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $5, x0 ## 2; \
+ pslld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor RK3, x3 ## 2; \
+ pxor RK1, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $22, x2 ## 2; \
+ pslld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x3 ## 1, x0 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ psrld $1, x1 ## 1; \
+ pslld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ pxor x3 ## 2, x0 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ psrld $1, x1 ## 2; \
+ pslld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x3 ## 1, x4 ## 1; \
+ psrld $7, x3 ## 1; \
+ pslld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ movdqa x3 ## 2, x4 ## 2; \
+ psrld $7, x3 ## 2; \
+ pslld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $13, x0 ## 1; \
+ pslld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $3, x2 ## 1; \
+ pslld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ psrld $13, x0 ## 2; \
+ pslld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $3, x2 ## 2; \
+ pslld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 3, RK3); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+ENTRY(__serpent_enc_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz .L__enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+.L__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk_8way)
+
+ENTRY(serpent_dec_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(serpent_dec_blk_8way)
diff --git a/marvell/linux/arch/x86/crypto/serpent_avx2_glue.c b/marvell/linux/arch/x86/crypto/serpent_avx2_glue.c
new file mode 100644
index 0000000..f973ace
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent_avx2_glue.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/serpent-avx.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ecb = serpent_ecb_enc_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_encrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ctr = serpent_ctr_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ctr = serpent_ctr_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = __serpent_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .xts = serpent_xts_enc_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .xts = serpent_xts_enc_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = serpent_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ecb = serpent_ecb_dec_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .cbc = serpent_cbc_dec_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = __serpent_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .xts = serpent_xts_dec_16way }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .xts = serpent_xts_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = serpent_xts_dec }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_enc_xts, req,
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_dec_xts, req,
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(serpent)",
+ .base.cra_driver_name = "__xts-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
+ .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = xts_serpent_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit fini(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS_CRYPTO("serpent");
+MODULE_ALIAS_CRYPTO("serpent-asm");
diff --git a/marvell/linux/arch/x86/crypto/serpent_avx_glue.c b/marvell/linux/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 0000000..7806d1c
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for AVX assembler versions of Serpent Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/serpent-avx.h>
+
+/* 8-way parallel cipher functions */
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
+
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
+
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
+
+asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
+
+asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
+
+asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
+
+void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
+
+void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt);
+}
+EXPORT_SYMBOL_GPL(serpent_xts_enc);
+
+void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt);
+}
+EXPORT_SYMBOL_GPL(serpent_xts_dec);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+EXPORT_SYMBOL_GPL(xts_serpent_setkey);
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_encrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = serpent_ctr_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = __serpent_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = serpent_xts_enc_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = serpent_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = __serpent_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = serpent_xts_dec_8way_avx }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = serpent_xts_dec }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_enc_xts, req,
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_dec_xts, req,
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(serpent)",
+ .base.cra_driver_name = "__xts-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
+ .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = xts_serpent_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init serpent_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit serpent_exit(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(serpent_init);
+module_exit(serpent_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/marvell/linux/arch/x86/crypto/serpent_sse2_glue.c b/marvell/linux/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 0000000..4fed8d2
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <asm/crypto/serpent-sse2.h>
+#include <asm/crypto/glue_helper.h>
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s)
+{
+ u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+ unsigned int j;
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ ivs[j] = src[j];
+
+ serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s,
+ le128 *iv)
+{
+ be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+ unsigned int i;
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+ if (dst != src)
+ dst[i] = src[i];
+
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
+ }
+
+ serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = serpent_enc_blk_xway }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_encrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = serpent_crypt_ctr_xway }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = serpent_crypt_ctr }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = serpent_dec_blk_xway }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = __serpent_decrypt }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = serpent_decrypt_cbc_xway }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = __serpent_decrypt }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt,
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init serpent_sse2_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2)) {
+ printk(KERN_INFO "SSE2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/marvell/linux/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/marvell/linux/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 0000000..9f712a7
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,711 @@
+/*
+ * Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ *This implementation is based on the previous SSSE3 release:
+ *Visit http://software.intel.com/en-us/articles/
+ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ *Updates 20-byte SHA-1 record in 'hash' for even number of
+ *'num_blocks' consecutive 64-byte blocks
+ *
+ *extern "C" void sha1_transform_avx2(
+ * int *hash, const char* input, size_t num_blocks );
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %r11d
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ * - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
+
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ /* Rotate macros */
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ /* Define register aliases */
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ /* message scheduling pre-compute for rounds 0-15 */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+ WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_16_31
+ /*
+ * message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction
+ *
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency
+ */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ /* w[i-14] */
+ vpalignr $8, WY_minus_16, WY_minus_12, WY
+ vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ /*
+ * in SHA-1 specification:
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal:
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization
+ * since w[i]=>w[i-3] dependency is broken
+ */
+
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ /* W is W_minus_32 before xor */
+ vpxor WY_minus_28, WY, WY
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+/* Macro relies on saved ROUND_Fx */
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) /* Precalculate F for first round */
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B /* b>>>2 */
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 /* ~b&d */
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30),A, TB /* b>>>2 for next round */
+
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ /*
+ * Calculate F for the next round
+ * (b & c) ^ andn[b, d]
+ */
+ and B, A /* b&c */
+ xor T1, A /* F1 = (b&c) ^ (~b&d) */
+
+ lea (RE,RTA), E /* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ /* Calculate F for the next round */
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+ .endif
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E /* E += A >>> 5 */
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+
+ /* Calculate F for the next round
+ * (b and c) or (d and (b or c))
+ */
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E /* E += A >>> 5 */
+
+.endm
+
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+_loop:
+ /*
+ * code loops through more than one block
+ * we use K_BASE value as a signal of a last block,
+ * it is set below by: cmovae BUFFER_PTR, K_BASE
+ */
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz _begin
+ .align 32
+ jmp _end
+ .align 32
+_begin:
+
+ /*
+ * Do first block
+ * rounds: 0,2,4,6,8
+ */
+ .set j, 0
+ .rept 5
+ RR j
+ .set j, j+2
+ .endr
+
+ jmp _loop0
+_loop0:
+
+ /*
+ * rounds:
+ * 10,12,14,16,18
+ * 20,22,24,26,28
+ * 30,32,34,36,38
+ * 40,42,44,46,48
+ * 50,52,54,56,58
+ */
+ .rept 25
+ RR j
+ .set j, j+2
+ .endr
+
+ /* Update Counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+ /*
+ * rounds
+ * 60,62,64,66,68
+ * 70,72,74,76,78
+ */
+ .rept 10
+ RR j
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz _loop
+
+ mov TB, B
+
+ /* Process second block */
+ /*
+ * rounds
+ * 0+80, 2+80, 4+80, 6+80, 8+80
+ * 10+80,12+80,14+80,16+80,18+80
+ */
+
+ .set j, 0
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop1
+_loop1:
+ /*
+ * rounds
+ * 20+80,22+80,24+80,26+80,28+80
+ * 30+80,32+80,34+80,36+80,38+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop2
+_loop2:
+
+ /*
+ * rounds
+ * 40+80,42+80,44+80,46+80,48+80
+ * 50+80,52+80,54+80,56+80,58+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /* update counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+ jmp _loop3
+_loop3:
+
+ /*
+ * rounds
+ * 60+80,62+80,64+80,66+80,68+80
+ * 70+80,72+80,74+80,76+80,78+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ /* Reset state for AVX2 reg permutation */
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp _loop
+
+ .align 32
+ _end:
+
+.endm
+/*
+ * macro implements SHA-1 function's body for several 64-byte blocks
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ /* Align stack */
+ mov %rsp, %rbx
+ and $~(0x20-1), %rsp
+ push %rbx
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ /* Setup initial values */
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ add $RESERVE_STACK, %rsp
+ pop %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ ret
+
+ ENDPROC(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/marvell/linux/arch/x86/crypto/sha1_ni_asm.S b/marvell/linux/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 0000000..ebbdba7
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,304 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define RSPSAVE %rax
+
+/* gcc conversion */
+#define FRAME_SIZE 32 /* space for 2x16 bytes */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+ mov %rsp, RSPSAVE
+ sub $FRAME_SIZE, %rsp
+ and $~0xF, %rsp
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /* load initial hash values */
+ pinsrd $3, 1*16(DIGEST_PTR), E0
+ movdqu 0*16(DIGEST_PTR), ABCD
+ pand UPPER_WORD_MASK(%rip), E0
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa E0, (0*16)(%rsp)
+ movdqa ABCD, (1*16)(%rsp)
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG0
+ pshufb SHUF_MASK, MSG0
+ paddd MSG0, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG1
+ pshufb SHUF_MASK, MSG1
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG1, MSG0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG2
+ pshufb SHUF_MASK, MSG2
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG3
+ pshufb SHUF_MASK, MSG3
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 16-19 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 20-23 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 24-27 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 28-31 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 32-35 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 36-39 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 40-43 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 44-47 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 48-51 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 52-55 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 56-59 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 60-63 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $3, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 64-67 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $3, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 68-71 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $3, E1, ABCD
+ pxor MSG1, MSG3
+
+ /* Rounds 72-75 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $3, E0, ABCD
+
+ /* Rounds 76-79 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1rnds4 $3, E1, ABCD
+
+ /* Add current hash values with previously saved */
+ sha1nexte (0*16)(%rsp), E0
+ paddd (1*16)(%rsp), ABCD
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, 0*16(DIGEST_PTR)
+ pextrd $3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+ mov RSPSAVE, %rsp
+
+ ret
+ENDPROC(sha1_ni_transform)
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
+.align 16
+UPPER_WORD_MASK:
+ .octa 0xFFFFFFFF000000000000000000000000
diff --git a/marvell/linux/arch/x86/crypto/sha1_ssse3_asm.S b/marvell/linux/arch/x86/crypto/sha1_ssse3_asm.S
new file mode 100644
index 0000000..99c5b8c
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha1_ssse3_asm.S
@@ -0,0 +1,553 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ * http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi // arg1
+#define BUF %rsi // arg2
+#define CNT %rdx // arg3
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %r12d
+#define REG_E %edx
+
+#define REG_T1 %eax
+#define REG_T2 %ebx
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_END %r11
+
+#define W_TMP1 %xmm0
+#define W_TMP2 %xmm9
+
+#define W0 %xmm1
+#define W4 %xmm2
+#define W8 %xmm3
+#define W12 %xmm4
+#define W16 %xmm5
+#define W20 %xmm6
+#define W24 %xmm7
+#define W28 %xmm8
+
+#define XMM_SHUFB_BSWAP %xmm10
+
+/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
+#define WK(t) (((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD 16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+
+ push %rbx
+ push %r12
+ push %rbp
+ mov %rsp, %rbp
+
+ sub $64, %rsp # allocate workspace
+ and $~15, %rsp # align stack
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ shl $6, CNT # multiply by 64
+ add BUF, CNT
+ mov CNT, BUFFER_END
+
+ lea K_XMM_AR(%rip), K_BASE
+ xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ # cleanup workspace
+ mov $8, %ecx
+ mov %rsp, %rdi
+ xor %eax, %eax
+ rep stosq
+
+ mov %rbp, %rsp # deallocate workspace
+ pop %rbp
+ pop %r12
+ pop %rbx
+ ret
+
+ ENDPROC(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+ INIT_REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ .set i, 0
+ .rept W_PRECALC_AHEAD
+ W_PRECALC i
+ .set i, (i+1)
+ .endr
+
+.align 4
+1:
+ RR F1,A,B,C,D,E,0
+ RR F1,D,E,A,B,C,2
+ RR F1,B,C,D,E,A,4
+ RR F1,E,A,B,C,D,6
+ RR F1,C,D,E,A,B,8
+
+ RR F1,A,B,C,D,E,10
+ RR F1,D,E,A,B,C,12
+ RR F1,B,C,D,E,A,14
+ RR F1,E,A,B,C,D,16
+ RR F1,C,D,E,A,B,18
+
+ RR F2,A,B,C,D,E,20
+ RR F2,D,E,A,B,C,22
+ RR F2,B,C,D,E,A,24
+ RR F2,E,A,B,C,D,26
+ RR F2,C,D,E,A,B,28
+
+ RR F2,A,B,C,D,E,30
+ RR F2,D,E,A,B,C,32
+ RR F2,B,C,D,E,A,34
+ RR F2,E,A,B,C,D,36
+ RR F2,C,D,E,A,B,38
+
+ RR F3,A,B,C,D,E,40
+ RR F3,D,E,A,B,C,42
+ RR F3,B,C,D,E,A,44
+ RR F3,E,A,B,C,D,46
+ RR F3,C,D,E,A,B,48
+
+ RR F3,A,B,C,D,E,50
+ RR F3,D,E,A,B,C,52
+ RR F3,B,C,D,E,A,54
+ RR F3,E,A,B,C,D,56
+ RR F3,C,D,E,A,B,58
+
+ add $64, BUFFER_PTR # move to the next 64-byte block
+ cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
+ cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
+
+ RR F4,A,B,C,D,E,60
+ RR F4,D,E,A,B,C,62
+ RR F4,B,C,D,E,A,64
+ RR F4,E,A,B,C,D,66
+ RR F4,C,D,E,A,B,68
+
+ RR F4,A,B,C,D,E,70
+ RR F4,D,E,A,B,C,72
+ RR F4,B,C,D,E,A,74
+ RR F4,E,A,B,C,D,76
+ RR F4,C,D,E,A,B,78
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), B
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ RESTORE_RENAMED_REGS
+ cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
+ jne 1b
+.endm
+
+.macro INIT_REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set T1, REG_T1
+ .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+ # order is important (REG_C is where it should be)
+ mov B, REG_B
+ mov D, REG_D
+ mov A, REG_A
+ mov E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES a, b
+ .set _T, \a
+ .set \a, \b
+ .set \b, _T
+.endm
+
+.macro F1 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ xor \d, T1
+ and \b, T1
+ xor \d, T1
+.endm
+
+.macro F2 b, c, d
+ mov \d, T1
+ SWAP_REG_NAMES \d, T1
+ xor \c, T1
+ xor \b, T1
+.endm
+
+.macro F3 b, c ,d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ mov \b, T2
+ or \b, T1
+ and \c, T2
+ and \d, T1
+ or T2, T1
+.endm
+
+.macro F4 b, c, d
+ F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ * t1 = F(b, c, d); e += w(i)
+ * e += t1; b <<= 30; d += w(i+1);
+ * t1 = F(a, b, c);
+ * d += t1; a <<= 5;
+ * e += a;
+ * t1 = e; a >>= 7;
+ * t1 <<= 5;
+ * d += t1;
+ */
+.macro RR F, a, b, c, d, e, round
+ add WK(\round), \e
+ \F \b, \c, \d # t1 = F(b, c, d);
+ W_PRECALC (\round + W_PRECALC_AHEAD)
+ rol $30, \b
+ add T1, \e
+ add WK(\round + 1), \d
+
+ \F \a, \b, \c
+ W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+ rol $5, \a
+ add \a, \e
+ add T1, \d
+ ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
+
+ mov \e, T1
+ SWAP_REG_NAMES \e, T1
+
+ rol $5, T1
+ add T1, \d
+
+ # write: \a, \b
+ # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC r
+ .set i, \r
+
+ .if (i < 20)
+ .set K_XMM, 0
+ .elseif (i < 40)
+ .set K_XMM, 16
+ .elseif (i < 60)
+ .set K_XMM, 32
+ .elseif (i < 80)
+ .set K_XMM, 48
+ .endif
+
+ .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+ .set i, ((\r) % 80) # pre-compute for the next iteration
+ .if (i == 0)
+ W_PRECALC_RESET
+ .endif
+ W_PRECALC_00_15
+ .elseif (i<32)
+ W_PRECALC_16_31
+ .elseif (i < 80) // rounds 32-79
+ W_PRECALC_32_79
+ .endif
+.endm
+
+.macro W_PRECALC_RESET
+ .set W, W0
+ .set W_minus_04, W4
+ .set W_minus_08, W8
+ .set W_minus_12, W12
+ .set W_minus_16, W16
+ .set W_minus_20, W20
+ .set W_minus_24, W24
+ .set W_minus_28, W28
+ .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+ .set W_minus_32, W_minus_28
+ .set W_minus_28, W_minus_24
+ .set W_minus_24, W_minus_20
+ .set W_minus_20, W_minus_16
+ .set W_minus_16, W_minus_12
+ .set W_minus_12, W_minus_08
+ .set W_minus_08, W_minus_04
+ .set W_minus_04, W
+ .set W, W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+ .if ((i & 3) == 0)
+ movdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ pshufb XMM_SHUFB_BSWAP, W_TMP1
+ movdqa W_TMP1, W
+ .elseif ((i & 3) == 2)
+ paddd (K_BASE), W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ * instruction
+ *
+ * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
+ * dependency, but improves for 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+ # blended scheduling of vector and scalar instruction streams, one 4-wide
+ # vector iteration / 4 scalar rounds
+ .if ((i & 3) == 0)
+ movdqa W_minus_12, W
+ palignr $8, W_minus_16, W # w[i-14]
+ movdqa W_minus_04, W_TMP1
+ psrldq $4, W_TMP1 # w[i-3]
+ pxor W_minus_08, W
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W_TMP1
+ pxor W_TMP1, W
+ movdqa W, W_TMP2
+ movdqa W, W_TMP1
+ pslldq $12, W_TMP2
+ .elseif ((i & 3) == 2)
+ psrld $31, W
+ pslld $1, W_TMP1
+ por W, W_TMP1
+ movdqa W_TMP2, W
+ psrld $30, W_TMP2
+ pslld $2, W
+ .elseif ((i & 3) == 3)
+ pxor W, W_TMP1
+ pxor W_TMP2, W_TMP1
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+ .if ((i & 3) == 0)
+ movdqa W_minus_04, W_TMP1
+ pxor W_minus_28, W # W is W_minus_32 before xor
+ palignr $8, W_minus_08, W_TMP1
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W
+ pxor W_TMP1, W
+ movdqa W, W_TMP1
+ .elseif ((i & 3) == 2)
+ psrld $30, W
+ pslld $2, W_TMP1
+ por W, W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_SSSE3
+
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+ movdqu \a,\b
+.endm
+
+/* SSSE3 optimized implementation:
+ * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
+ * unsigned int rounds);
+ */
+SHA1_VECTOR_ASM sha1_transform_ssse3
+
+#ifdef CONFIG_AS_AVX
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+ .if ((i & 3) == 0)
+ vmovdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
+ .elseif ((i & 3) == 2)
+ vpaddd (K_BASE), W, W_TMP1
+ .elseif ((i & 3) == 3)
+ vmovdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
+ vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
+ vpxor W_minus_08, W, W
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ .elseif ((i & 3) == 1)
+ vpxor W_TMP1, W, W
+ vpslldq $12, W, W_TMP2
+ vpslld $1, W, W_TMP1
+ .elseif ((i & 3) == 2)
+ vpsrld $31, W, W
+ vpor W, W_TMP1, W_TMP1
+ vpslld $2, W_TMP2, W
+ vpsrld $30, W_TMP2, W_TMP2
+ .elseif ((i & 3) == 3)
+ vpxor W, W_TMP1, W_TMP1
+ vpxor W_TMP2, W_TMP1, W
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+ vpxor W_minus_28, W, W # W is W_minus_32 before xor
+ .elseif ((i & 3) == 1)
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ vpxor W_TMP1, W, W
+ .elseif ((i & 3) == 2)
+ vpslld $2, W, W_TMP1
+ vpsrld $30, W, W
+ vpor W, W_TMP1, W
+ .elseif ((i & 3) == 3)
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+ vmovdqu \a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
+ * unsigned int rounds);
+ */
+SHA1_VECTOR_ASM sha1_transform_avx
+
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha1_ssse3_glue.c b/marvell/linux/arch/x86/crypto/sha1_ssse3_glue.c
new file mode 100644
index 0000000..639d4c2
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha1_ssse3_glue.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * Supplemental SSE3 instructions.
+ *
+ * This file is based on sha1_generic.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha1_base.h>
+#include <asm/simd.h>
+
+typedef void (sha1_transform_fn)(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha1_transform_fn *sha1_xform)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable() ||
+ (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
+ return crypto_sha1_update(desc, data, len);
+
+ /* make sure casting to sha1_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
+{
+ if (!crypto_simd_usable())
+ return crypto_sha1_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_xform);
+ sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
+ kernel_fpu_end();
+
+ return sha1_base_finish(desc, out);
+}
+
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ssse3_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ssse3_update,
+ .final = sha1_ssse3_final,
+ .finup = sha1_ssse3_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shash(&sha1_ssse3_alg);
+ return 0;
+}
+
+static void unregister_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shash(&sha1_ssse3_alg);
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx_update,
+ .final = sha1_avx_final,
+ .finup = sha1_avx_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int register_sha1_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shash(&sha1_avx_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shash(&sha1_avx_alg);
+}
+
+#else /* CONFIG_AS_AVX */
+static inline int register_sha1_avx(void) { return 0; }
+static inline void unregister_sha1_avx(void) { }
+#endif /* CONFIG_AS_AVX */
+
+
+#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ && boot_cpu_has(X86_FEATURE_BMI1)
+ && boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static void sha1_apply_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds)
+{
+ /* Select the optimal transform based on data block size */
+ if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(digest, data, rounds);
+ else
+ sha1_transform_avx(digest, data, rounds);
+}
+
+static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx2_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx2_update,
+ .final = sha1_avx2_final,
+ .finup = sha1_avx2_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shash(&sha1_avx2_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shash(&sha1_avx2_alg);
+}
+
+#else
+static inline int register_sha1_avx2(void) { return 0; }
+static inline void unregister_sha1_avx2(void) { }
+#endif
+
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ni_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ni_update,
+ .final = sha1_ni_final,
+ .finup = sha1_ni_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shash(&sha1_ni_alg);
+ return 0;
+}
+
+static void unregister_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shash(&sha1_ni_alg);
+}
+
+#else
+static inline int register_sha1_ni(void) { return 0; }
+static inline void unregister_sha1_ni(void) { }
+#endif
+
+static int __init sha1_ssse3_mod_init(void)
+{
+ if (register_sha1_ssse3())
+ goto fail;
+
+ if (register_sha1_avx()) {
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ if (register_sha1_avx2()) {
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ if (register_sha1_ni()) {
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha1_ssse3_mod_fini(void)
+{
+ unregister_sha1_ni();
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+}
+
+module_init(sha1_ssse3_mod_init);
+module_exit(sha1_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-ssse3");
+MODULE_ALIAS_CRYPTO("sha1-avx");
+MODULE_ALIAS_CRYPTO("sha1-avx2");
+#ifdef CONFIG_AS_SHA1_NI
+MODULE_ALIAS_CRYPTO("sha1-ni");
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha256-avx-asm.S b/marvell/linux/arch/x86/crypto/sha256-avx-asm.S
new file mode 100644
index 0000000..001bbcf
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha256-avx-asm.S
@@ -0,0 +1,502 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+ shld $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpsrld $7, XTMP1, XTMP2
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpslld $(32-7), XTMP1, XTMP3
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ vpsrld $18, XTMP1, XTMP2 #
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpslld $(32-18), XTMP1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP1, XTMP3, XTMP3 #
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpxor XTMP3, XTMP2, XTMP2 #
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP3, XTMP2, XTMP2
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER #
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_avx)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp # allocate stack space
+ and $~15, %rsp # align stack pointer
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+loop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+loop1:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 1*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 2*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 3*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne loop1
+
+ mov $2, SRND
+loop2:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd 1*16(TBL), X1, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ sub $1, SRND
+ jne loop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne loop0
+
+done_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ ret
+ENDPROC(sha256_transform_avx)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha256-avx2-asm.S b/marvell/linux/arch/x86/crypto/sha256-avx2-asm.S
new file mode 100644
index 0000000..1420db1
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha256-avx2-asm.S
@@ -0,0 +1,771 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+c = %ecx
+d = %r8d
+e = %edx # clobbers NUM_BLKS
+y3 = %esi # clobbers INP
+
+SRND = CTX # SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE = 0
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_CTX_SIZE = 8
+_RSP_SIZE = 8
+
+_XFER = 0
+_XMM_SAVE = _XFER + _XFER_SIZE
+_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
+_INP = _INP_END + _INP_END_SIZE
+_CTX = _INP + _INP_SIZE
+_RSP = _CTX + _CTX_SIZE
+STACK_SIZE = _RSP + _RSP_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+ X_ = X0
+ X0 = X1
+ X1 = X2
+ X2 = X3
+ X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ vpsrld $7, XTMP1, XTMP2
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ vpslld $(32-7), XTMP1, XTMP3
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
+
+ vpsrld $18, XTMP1, XTMP2
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 1*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ vpslld $(32-18), XTMP1, XTMP1
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+
+ vpxor XTMP1, XTMP3, XTMP3
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ offset = \disp + 2*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ vpxor XTMP3, XTMP2, XTMP2
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a ,T1 # T1 = (a >> 2) # S0
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1,h # h = k + w + h + S0 # --
+ add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3,h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 3*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP3, XTMP2, XTMP2
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*1 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*2 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*3 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_rorx)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ mov %rsp, %rax
+ subq $STACK_SIZE, %rsp
+ and $-32, %rsp # align rsp to 32 byte boundary
+ mov %rax, _RSP(%rsp)
+
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ cmp NUM_BLKS, INP
+ je only_one_block
+
+ ## load initial digest
+ mov (CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+
+loop0:
+ ## Load first 16 dwords from two blocks
+ VMOVDQ 0*32(INP),XTMP0
+ VMOVDQ 1*32(INP),XTMP1
+ VMOVDQ 2*32(INP),XTMP2
+ VMOVDQ 3*32(INP),XTMP3
+
+ ## byte swap data
+ vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
+ vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
+ vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
+ vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
+
+ ## transpose data into high/low halves
+ vperm2i128 $0x20, XTMP2, XTMP0, X0
+ vperm2i128 $0x31, XTMP2, XTMP0, X1
+ vperm2i128 $0x20, XTMP3, XTMP1, X2
+ vperm2i128 $0x31, XTMP3, XTMP1, X3
+
+last_block_enter:
+ add $64, INP
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 12 each
+ xor SRND, SRND
+
+.align 16
+loop1:
+ vpaddd K256+0*32(SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 0*32
+
+ vpaddd K256+1*32(SRND), X0, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 1*32
+
+ vpaddd K256+2*32(SRND), X0, XFER
+ vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 2*32
+
+ vpaddd K256+3*32(SRND), X0, XFER
+ vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 3*32
+
+ add $4*32, SRND
+ cmp $3*4*32, SRND
+ jb loop1
+
+loop2:
+ ## Do last 16 rounds with no scheduling
+ vpaddd K256+0*32(SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS _XFER + 0*32
+
+ vpaddd K256+1*32(SRND), X1, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS _XFER + 1*32
+ add $2*32, SRND
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ cmp $4*4*32, SRND
+ jb loop2
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ ja done_hash
+
+ #### Do second block using previously scheduled results
+ xor SRND, SRND
+.align 16
+loop3:
+ DO_4ROUNDS _XFER + 0*32 + 16
+ DO_4ROUNDS _XFER + 1*32 + 16
+ add $2*32, SRND
+ cmp $4*4*32, SRND
+ jb loop3
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+ add $64, INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ jb loop0
+ ja done_hash
+
+do_last_block:
+ VMOVDQ 0*16(INP),XWORD0
+ VMOVDQ 1*16(INP),XWORD1
+ VMOVDQ 2*16(INP),XWORD2
+ VMOVDQ 3*16(INP),XWORD3
+
+ vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
+ vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
+ vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
+ vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+ jmp last_block_enter
+
+only_one_block:
+
+ ## load initial digest
+ mov (4*0)(CTX),a
+ mov (4*1)(CTX),b
+ mov (4*2)(CTX),c
+ mov (4*3)(CTX),d
+ mov (4*4)(CTX),e
+ mov (4*5)(CTX),f
+ mov (4*6)(CTX),g
+ mov (4*7)(CTX),h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+ jmp do_last_block
+
+done_hash:
+
+ mov _RSP(%rsp), %rsp
+
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ ret
+ENDPROC(sha256_transform_rorx)
+
+.section .rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha256-ssse3-asm.S b/marvell/linux/arch/x86/crypto/sha256-ssse3-asm.S
new file mode 100644
index 0000000..c6c05ed
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha256-ssse3-asm.S
@@ -0,0 +1,511 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ MOVDQ \p2, \p1
+ pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+ movdqa X3, XTMP0
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ palignr $4, X2, XTMP0 # XTMP0 = W[-7]
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa X1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ palignr $4, X0, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
+ movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pslld $(32-7), XTMP1 #
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ psrld $7, XTMP2 #
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ pslld $(32-18), XTMP3 #
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ psrld $18, XTMP2 #
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pxor XTMP4, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP2
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ pxor XTMP3, XTMP2 #
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, X0 # X0 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_ssse3)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $~15, %rsp
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS
+ mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ movdqa _SHUF_00BA(%rip), SHUF_00BA
+ movdqa _SHUF_DC00(%rip), SHUF_DC00
+
+loop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+loop1:
+ movdqa (TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 1*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 2*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 3*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne loop1
+
+ mov $2, SRND
+loop2:
+ paddd (TBL), X0
+ movdqa X0, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd 1*16(TBL), X1
+ movdqa X1, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X2, X0
+ movdqa X3, X1
+
+ sub $1, SRND
+ jne loop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne loop0
+
+done_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+
+ ret
+ENDPROC(sha256_transform_ssse3)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/marvell/linux/arch/x86/crypto/sha256_ni_asm.S b/marvell/linux/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 0000000..fb58f58
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,355 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSGTMP0 %xmm3
+#define MSGTMP1 %xmm4
+#define MSGTMP2 %xmm5
+#define MSGTMP3 %xmm6
+#define MSGTMP4 %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(DIGEST_PTR), STATE0
+ movdqu 1*16(DIGEST_PTR), STATE1
+
+ pshufd $0xB1, STATE0, STATE0 /* CDAB */
+ pshufd $0x1B, STATE1, STATE1 /* EFGH */
+ movdqa STATE0, MSGTMP4
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP0
+ paddd 0*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP1
+ paddd 1*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP2
+ paddd 2*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP3
+ paddd 3*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 16-19 */
+ movdqa MSGTMP0, MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 20-23 */
+ movdqa MSGTMP1, MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 24-27 */
+ movdqa MSGTMP2, MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 28-31 */
+ movdqa MSGTMP3, MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 32-35 */
+ movdqa MSGTMP0, MSG
+ paddd 8*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 36-39 */
+ movdqa MSGTMP1, MSG
+ paddd 9*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 40-43 */
+ movdqa MSGTMP2, MSG
+ paddd 10*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 44-47 */
+ movdqa MSGTMP3, MSG
+ paddd 11*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 48-51 */
+ movdqa MSGTMP0, MSG
+ paddd 12*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 52-55 */
+ movdqa MSGTMP1, MSG
+ paddd 13*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 56-59 */
+ movdqa MSGTMP2, MSG
+ paddd 14*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 60-63 */
+ movdqa MSGTMP3, MSG
+ paddd 15*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, STATE0, STATE0 /* FEBA */
+ pshufd $0xB1, STATE1, STATE1 /* DCHG */
+ movdqa STATE0, MSGTMP4
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, MSGTMP4, STATE1 /* HGFE */
+
+ movdqu STATE0, 0*16(DIGEST_PTR)
+ movdqu STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+ ret
+ENDPROC(sha256_ni_transform)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/marvell/linux/arch/x86/crypto/sha256_ssse3_glue.c b/marvell/linux/arch/x86/crypto/sha256_ssse3_glue.c
new file mode 100644
index 0000000..f9aff31
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha256_ssse3_glue.c
@@ -0,0 +1,433 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA256 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha256_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation.
+ *
+ * Author:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha256_base.h>
+#include <linux/string.h>
+#include <asm/simd.h>
+
+asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
+ u64 rounds);
+typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
+
+static int _sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha256_transform_fn *sha256_xform)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable() ||
+ (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
+ return crypto_sha256_update(desc, data, len);
+
+ /* make sure casting to sha256_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
+{
+ if (!crypto_simd_usable())
+ return crypto_sha256_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_xform);
+ sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
+ kernel_fpu_end();
+
+ return sha256_base_finish(desc, out);
+}
+
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return _sha256_update(desc, data, len, sha256_transform_ssse3);
+}
+
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ssse3_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ssse3_update,
+ .final = sha256_ssse3_final,
+ .finup = sha256_ssse3_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ssse3_update,
+ .final = sha256_ssse3_final,
+ .finup = sha256_ssse3_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return _sha256_update(desc, data, len, sha256_transform_avx);
+}
+
+static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_avx);
+}
+
+static int sha256_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int register_sha256_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+}
+
+#else
+static inline int register_sha256_avx(void) { return 0; }
+static inline void unregister_sha256_avx(void) { }
+#endif
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return _sha256_update(desc, data, len, sha256_transform_rorx);
+}
+
+static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_rorx);
+}
+
+static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx2_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha256_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+}
+
+#else
+static inline int register_sha256_avx2(void) { return 0; }
+static inline void unregister_sha256_avx2(void) { }
+#endif
+
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+ u64 rounds); /*unsigned int rounds);*/
+
+static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return _sha256_update(desc, data, len, sha256_ni_transform);
+}
+
+static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_ni_transform);
+}
+
+static int sha256_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ni_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+ return 0;
+}
+
+static void unregister_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+}
+
+#else
+static inline int register_sha256_ni(void) { return 0; }
+static inline void unregister_sha256_ni(void) { }
+#endif
+
+static int __init sha256_ssse3_mod_init(void)
+{
+ if (register_sha256_ssse3())
+ goto fail;
+
+ if (register_sha256_avx()) {
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_avx2()) {
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_ni()) {
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha256_ssse3_mod_fini(void)
+{
+ unregister_sha256_ni();
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+}
+
+module_init(sha256_ssse3_mod_init);
+module_exit(sha256_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha256-ssse3");
+MODULE_ALIAS_CRYPTO("sha256-avx");
+MODULE_ALIAS_CRYPTO("sha256-avx2");
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha224-ssse3");
+MODULE_ALIAS_CRYPTO("sha224-avx");
+MODULE_ALIAS_CRYPTO("sha224-avx2");
+#ifdef CONFIG_AS_SHA256_NI
+MODULE_ALIAS_CRYPTO("sha256-ni");
+MODULE_ALIAS_CRYPTO("sha224-ni");
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha512-avx-asm.S b/marvell/linux/arch/x86/crypto/sha512-avx-asm.S
new file mode 100644
index 0000000..39235fe
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha512-avx-asm.S
@@ -0,0 +1,426 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro RORQ p1 p2
+ # shld is faster than ror on Sandybridge
+ shld $(64-\p2), \p1, \p1
+.endm
+
+.macro SHA512_Round rnd
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ RORQ tmp0, 23 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ RORQ tmp0, 5 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+
+ idx = \rnd - 2
+ vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
+ idx = \rnd - 15
+ vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov f_64, T1
+ vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
+ mov e_64, tmp0
+ vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
+ xor g_64, T1
+ RORQ tmp0, 23 # 41
+ vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
+ and e_64, T1
+ xor e_64, tmp0
+ vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1#
+ vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 # 18
+ vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
+ xor e_64, tmp0
+ mov a_64, T2
+ add h_64, T1
+ vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
+ mov a_64, tmp0
+ xor c_64, T2
+ vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
+ and c_64, tmp0
+ and b_64, T2
+ vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor tmp0, T2
+ mov a_64, tmp0
+ vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 # 39
+ vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+ # W[t-15]>>7 ^ W[t-15]<<63
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
+ add tmp0, h_64
+ RotateState
+ vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+ # W[t-2]<<25
+ mov f_64, T1
+ vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
+ mov e_64, tmp0
+ xor g_64, T1
+ idx = \rnd - 16
+ vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
+ idx = \rnd - 7
+ vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ RORQ tmp0, 23 # 41
+ and e_64, T1
+ xor e_64, tmp0
+ xor g_64, T1
+ vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 # 18
+ vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor e_64, tmp0
+ vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+ # s0(W[t-15]) + W[t-16]
+ mov a_64, T2
+ add h_64, T1
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ idx = \rnd
+ vmovdqa %xmm0, W_t(idx) # Store W[t]
+ vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ mov a_64, tmp0
+ xor c_64, T2
+ and c_64, tmp0
+ and b_64, T2
+ xor tmp0, T2
+ mov a_64, tmp0
+ RORQ tmp0, 5 # 39
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ add tmp0, h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(void* D, const void* M, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks
+########################################################################
+ENTRY(sha512_transform_avx)
+ cmp $0, msglen
+ je nowork
+
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, frame_GPRSAVE(%rsp)
+ mov %r12, frame_GPRSAVE +8*1(%rsp)
+ mov %r13, frame_GPRSAVE +8*2(%rsp)
+ mov %r14, frame_GPRSAVE +8*3(%rsp)
+ mov %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+ # Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz updateblock
+
+ # Restore GPRs
+ mov frame_GPRSAVE(%rsp), %rbx
+ mov frame_GPRSAVE +8*1(%rsp), %r12
+ mov frame_GPRSAVE +8*2(%rsp), %r13
+ mov frame_GPRSAVE +8*3(%rsp), %r14
+ mov frame_GPRSAVE +8*4(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+ ret
+ENDPROC(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha512-avx2-asm.S b/marvell/linux/arch/x86/crypto/sha512-avx2-asm.S
new file mode 100644
index 0000000..b16d560
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha512-avx2-asm.S
@@ -0,0 +1,752 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER = YTMP0
+
+BYTE_FLIP_MASK = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
+# 2nd arg
+INP = %rsi
+# 3rd arg
+NUM_BLKS = %rdx
+
+c = %rcx
+d = %r8
+e = %rdx
+y3 = %rsi
+
+TBL = %rdi # clobbers CTX1
+
+a = %rax
+b = %rbx
+
+f = %r9
+g = %r10
+h = %r11
+old_h = %r11
+
+T1 = %r12 # clobbers CTX2
+y0 = %r13
+y1 = %r14
+y2 = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each dword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+ Y_ = Y_0
+ Y_0 = Y_1
+ Y_1 = Y_2
+ Y_2 = Y_3
+ Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+ # Rotate symbols a..h right
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+ vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
+ vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+ # Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
+ # Calculate w[t-16] + w[t-7]
+ vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
+ # Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
+
+ # Calculate sigma0
+
+ # Calculate w[t-15] ror 1
+ vpsrlq $1, YTMP1, YTMP2
+ vpsllq $(64-1), YTMP1, YTMP3
+ vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
+ # Calculate w[t-15] shr 7
+ vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add frame_XFER(%rsp),h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ # Calculate w[t-15] ror 8
+ vpsrlq $8, YTMP1, YTMP2
+ vpsllq $(64-8), YTMP1, YTMP1
+ vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
+ # XOR the three components
+ vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
+
+
+ # Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
+ # Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
+ # Move to appropriate lanes for calculating w[18] and w[19]
+ vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ # Calculate w[16] and w[17] in both 128 bit lanes
+
+ # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
+ vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+
+################################### RND N + 2 #########################################
+
+ vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+ # Add sigma1 to the other compunents to get w[16] and w[17]
+ vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
+
+ # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
+
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+ # to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
+
+ # Form w[19, w[18], w17], w[16]
+ vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+ rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 2 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks
+########################################################################
+ENTRY(sha512_transform_rorx)
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, 8*0+frame_GPRSAVE(%rsp)
+ mov %r12, 8*1+frame_GPRSAVE(%rsp)
+ mov %r13, 8*2+frame_GPRSAVE(%rsp)
+ mov %r14, 8*3+frame_GPRSAVE(%rsp)
+ mov %r15, 8*4+frame_GPRSAVE(%rsp)
+
+ shl $7, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, frame_INPEND(%rsp)
+
+ ## load initial digest
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+loop0:
+ lea K512(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+ mov INP, frame_INP(%rsp)
+
+ ## schedule 64 input dwords, by doing 12 rounds of 4 each
+ movq $4, frame_SRND(%rsp)
+
+.align 16
+loop1:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 1*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 2*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 3*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(4*32), TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ subq $1, frame_SRND(%rsp)
+ jne loop1
+
+ movq $2, frame_SRND(%rsp)
+loop2:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ DO_4ROUNDS
+ vpaddq 1*32(TBL), Y_1, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(2*32), TBL
+ DO_4ROUNDS
+
+ vmovdqa Y_2, Y_0
+ vmovdqa Y_3, Y_1
+
+ subq $1, frame_SRND(%rsp)
+ jne loop2
+
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
+
+ mov frame_INP(%rsp), INP
+ add $128, INP
+ cmp frame_INPEND(%rsp), INP
+ jne loop0
+
+done_hash:
+
+# Restore GPRs
+ mov 8*0+frame_GPRSAVE(%rsp), %rbx
+ mov 8*1+frame_GPRSAVE(%rsp), %r12
+ mov 8*2+frame_GPRSAVE(%rsp), %r13
+ mov 8*3+frame_GPRSAVE(%rsp), %r14
+ mov 8*4+frame_GPRSAVE(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+ ret
+ENDPROC(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+ .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
+#endif
diff --git a/marvell/linux/arch/x86/crypto/sha512-ssse3-asm.S b/marvell/linux/arch/x86/crypto/sha512-ssse3-asm.S
new file mode 100644
index 0000000..66bbd90
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha512-ssse3-asm.S
@@ -0,0 +1,424 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ ror $23, tmp0 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ ror $5, tmp0 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+ # For clarity, integer instructions (for the rounds calculation) are indented
+ # by one tab. Vectored instructions (for the message scheduler) are indented
+ # by two tabs.
+
+ mov f_64, T1
+ idx = \rnd -2
+ movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
+ xor g_64, T1
+ and e_64, T1
+ movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ idx = \rnd - 15
+ movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov e_64, tmp0
+ ror $23, tmp0 # 41
+ movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
+ xor e_64, tmp0
+ ror $4, tmp0 # 18
+ psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
+ xor e_64, tmp0
+ ror $14, tmp0 # 14
+ psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
+ add tmp0, T1
+ add h_64, T1
+ pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov a_64, T2
+ xor c_64, T2
+ pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and b_64, T2
+ mov a_64, tmp0
+ psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and c_64, tmp0
+ xor tmp0, T2
+ psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov a_64, tmp0
+ ror $5, tmp0 # 39
+ pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor a_64, tmp0
+ ror $6, tmp0 # 34
+ pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor a_64, tmp0
+ ror $28, tmp0 # 28
+ psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add tmp0, T2
+ add T1, d_64
+ psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea (T1, T2), h_64
+ RotateState
+ movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
+ mov f_64, T1
+ xor g_64, T1
+ movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
+ and e_64, T1
+ xor g_64, T1
+ psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ mov e_64, tmp0
+ psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
+ ror $23, tmp0 # 41
+ xor e_64, tmp0
+ pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
+ ror $4, tmp0 # 18
+ xor e_64, tmp0
+ pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
+ ror $14, tmp0 # 14
+ add tmp0, T1
+ psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add h_64, T1
+ mov a_64, T2
+ psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor c_64, T2
+ and b_64, T2
+ pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
+ mov a_64, tmp0
+ and c_64, tmp0
+ idx = \rnd - 7
+ movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ xor tmp0, T2
+ pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
+ mov a_64, tmp0
+ paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror $5, tmp0 # 39
+ idx =\rnd-16
+ paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor a_64, tmp0
+ paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror $6, tmp0 # 34
+ movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
+ xor a_64, tmp0
+ paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
+ ror $28, tmp0 # 28
+ idx = \rnd
+ movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ add tmp0, T2
+ add T1, d_64
+ lea (T1, T2), h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_ssse3)
+
+ cmp $0, msglen
+ je nowork
+
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, frame_GPRSAVE(%rsp)
+ mov %r12, frame_GPRSAVE +8*1(%rsp)
+ mov %r13, frame_GPRSAVE +8*2(%rsp)
+ mov %r14, frame_GPRSAVE +8*3(%rsp)
+ mov %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+# Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ movdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ movdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz updateblock
+
+ # Restore GPRs
+ mov frame_GPRSAVE(%rsp), %rbx
+ mov frame_GPRSAVE +8*1(%rsp), %r12
+ mov frame_GPRSAVE +8*2(%rsp), %r13
+ mov frame_GPRSAVE +8*3(%rsp), %r14
+ mov frame_GPRSAVE +8*4(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+ ret
+ENDPROC(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/marvell/linux/arch/x86/crypto/sha512_ssse3_glue.c b/marvell/linux/arch/x86/crypto/sha512_ssse3_glue.c
new file mode 100644
index 0000000..458356a
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/sha512_ssse3_glue.c
@@ -0,0 +1,349 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA512 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha512_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <asm/simd.h>
+
+asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
+ u64 rounds);
+
+typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
+
+static int sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha512_transform_fn *sha512_xform)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable() ||
+ (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+ return crypto_sha512_update(desc, data, len);
+
+ /* make sure casting to sha512_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
+{
+ if (!crypto_simd_usable())
+ return crypto_sha512_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_xform);
+ sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
+ kernel_fpu_end();
+
+ return sha512_base_finish(desc, out);
+}
+
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_ssse3);
+}
+
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_ssse3_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_ssse3_update,
+ .final = sha512_ssse3_final,
+ .finup = sha512_ssse3_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_ssse3_update,
+ .final = sha512_ssse3_final,
+ .finup = sha512_ssse3_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
+ u64 rounds);
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_avx);
+}
+
+static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_avx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha512_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+}
+#else
+static inline int register_sha512_avx(void) { return 0; }
+static inline void unregister_sha512_avx(void) { }
+#endif
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+ u64 rounds);
+
+static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_rorx);
+}
+
+static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_rorx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx2_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha512_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+}
+#else
+static inline int register_sha512_avx2(void) { return 0; }
+static inline void unregister_sha512_avx2(void) { }
+#endif
+
+static int __init sha512_ssse3_mod_init(void)
+{
+
+ if (register_sha512_ssse3())
+ goto fail;
+
+ if (register_sha512_avx()) {
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ if (register_sha512_avx2()) {
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha512_ssse3_mod_fini(void)
+{
+ unregister_sha512_avx2();
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+}
+
+module_init(sha512_ssse3_mod_init);
+module_exit(sha512_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha512-ssse3");
+MODULE_ALIAS_CRYPTO("sha512-avx");
+MODULE_ALIAS_CRYPTO("sha512-avx2");
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha384-ssse3");
+MODULE_ALIAS_CRYPTO("sha384-avx");
+MODULE_ALIAS_CRYPTO("sha384-avx2");
diff --git a/marvell/linux/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 0000000..698b8f2
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,456 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "twofish-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
+
+/**********************************************************************
+ 8-way AVX twofish
+ **********************************************************************/
+#define CTX %rdi
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
+#define RR %xmm15
+
+#define RID1 %r13
+#define RID1d %r13d
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RGS1 %r8
+#define RGS1d %r8d
+#define RGS2 %r9
+#define RGS2d %r9d
+#define RGS3 %r10
+#define RGS3d %r10d
+
+
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+ movzbl src ## bl, RID1d; \
+ movzbl src ## bh, RID2d; \
+ shrq $16, src; \
+ movl t0(CTX, RID1, 4), dst ## d; \
+ movl t1(CTX, RID2, 4), RID2d; \
+ movzbl src ## bl, RID1d; \
+ xorl RID2d, dst ## d; \
+ movzbl src ## bh, RID2d; \
+ interleave_op(il_reg); \
+ xorl t2(CTX, RID1, 4), dst ## d; \
+ xorl t3(CTX, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
+ \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
+ shlq $32, RGS2; \
+ orq RGS1, RGS2; \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
+ shlq $32, RGS1; \
+ orq RGS1, RGS3;
+
+#define round_head_2(a, b, x1, y1, x2, y2) \
+ vmovq b ## 1, RGI3; \
+ vpextrq $1, b ## 1, RGI4; \
+ \
+ G(RGI1, RGI2, x1, s0, s1, s2, s3); \
+ vmovq a ## 2, RGI1; \
+ vpextrq $1, a ## 2, RGI2; \
+ vmovq RGS2, x1; \
+ vpinsrq $1, RGS3, x1, x1; \
+ \
+ G(RGI3, RGI4, y1, s1, s2, s3, s0); \
+ vmovq b ## 2, RGI3; \
+ vpextrq $1, b ## 2, RGI4; \
+ vmovq RGS2, y1; \
+ vpinsrq $1, RGS3, y1, y1; \
+ \
+ G(RGI1, RGI2, x2, s0, s1, s2, s3); \
+ vmovq RGS2, x2; \
+ vpinsrq $1, RGS3, x2, x2; \
+ \
+ G(RGI3, RGI4, y2, s1, s2, s3, s0); \
+ vmovq RGS2, y2; \
+ vpinsrq $1, RGS3, y2, y2;
+
+#define encround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(b); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpsrld $1, c, RT; \
+ vpslld $(32 - 1), c, c; \
+ vpor c, RT, c; \
+ vpxor d, y, d; \
+
+#define decround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(a); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpxor d, y, d; \
+ vpsrld $1, d, y; \
+ vpslld $(32 - 1), d, d; \
+ vpor d, y, d; \
+
+#define rotate_1l(x) \
+ vpslld $1, x, RR; \
+ vpsrld $(32 - 1), x, x; \
+ vpor x, RR, x;
+
+#define preload_rgi(c) \
+ vmovq c, RGI1; \
+ vpextrq $1, c, RGI2;
+
+#define encrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define decrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define encrypt_cycle(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
+
+#define encrypt_cycle_last(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
+
+#define decrypt_cycle(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
+
+#define decrypt_cycle_last(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3;
+
+.align 8
+__twofish_enc_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ */
+
+ vmovdqu w(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+ pushq %rcx;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ preload_rgi(RA1);
+ rotate_1l(RD1);
+ inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+ rotate_1l(RD2);
+
+ encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ encrypt_cycle_last(7);
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ popq %rcx;
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+
+ ret;
+ENDPROC(__twofish_enc_blk8)
+
+.align 8
+__twofish_dec_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+
+ inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ preload_rgi(RC1);
+ rotate_1l(RA1);
+ inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+ rotate_1l(RA2);
+
+ decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ decrypt_cycle_last(0);
+
+ vmovdqu (w)(CTX), RK1;
+
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+ ret;
+ENDPROC(__twofish_dec_blk8)
+
+ENTRY(twofish_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __twofish_enc_blk8;
+
+ store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ecb_enc_8way)
+
+ENTRY(twofish_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ecb_dec_8way)
+
+ENTRY(twofish_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_cbc_dec_8way)
+
+ENTRY(twofish_ctr_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX0, RX1, RY0);
+
+ call __twofish_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ popq %r12;
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ctr_8way)
+
+ENTRY(twofish_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_xts_enc_8way)
+
+ENTRY(twofish_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_xts_dec_8way)
diff --git a/marvell/linux/arch/x86/crypto/twofish-i586-asm_32.S b/marvell/linux/arch/x86/crypto/twofish-i586-asm_32.S
new file mode 100644
index 0000000..290cc4e
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish-i586-asm_32.S
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/***************************************************************************
+* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
+* *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+/* return address at 0 */
+
+#define in_blk 12 /* input byte array address parameter*/
+#define out_blk 8 /* output byte array address parameter*/
+#define ctx 4 /* Twofish context structure */
+
+#define a_offset 0
+#define b_offset 4
+#define c_offset 8
+#define d_offset 12
+
+/* Structure of the crypto context struct*/
+
+#define s0 0 /* S0 Array 256 Words each */
+#define s1 1024 /* S1 Array */
+#define s2 2048 /* S2 Array */
+#define s3 3072 /* S3 Array */
+#define w 4096 /* 8 whitening keys (word) */
+#define k 4128 /* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0D %eax
+#define R0B %al
+#define R0H %ah
+
+#define R1D %ebx
+#define R1B %bl
+#define R1H %bh
+
+#define R2D %ecx
+#define R2B %cl
+#define R2H %ch
+
+#define R3D %edx
+#define R3B %dl
+#define R3H %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+ xor w+offset(context), src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+ xor w+16+offset(context), src;
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define encrypt_round(a,b,c,d,round)\
+ push d ## D;\
+ movzx b ## B, %edi;\
+ mov s1(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ mov s2(%ebp,%edi,4),%esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%ebp,%edi,4),d ## D;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),%esi;\
+ movzx b ## B, %edi;\
+ xor s3(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx b ## H, %edi;\
+ ror $15, b ## D;\
+ xor (%ebp,%edi,4), d ## D;\
+ movzx a ## H, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add d ## D, %esi;\
+ add %esi, d ## D;\
+ add k+round(%ebp), %esi;\
+ xor %esi, c ## D;\
+ rol $15, c ## D;\
+ add k+4+round(%ebp),d ## D;\
+ xor %edi, d ## D;
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * last round has different rotations for the output preparation
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+ push d ## D;\
+ movzx b ## B, %edi;\
+ mov s1(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ mov s2(%ebp,%edi,4),%esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%ebp,%edi,4),d ## D;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),%esi;\
+ movzx b ## B, %edi;\
+ xor s3(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), d ## D;\
+ movzx a ## H, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add d ## D, %esi;\
+ add %esi, d ## D;\
+ add k+round(%ebp), %esi;\
+ xor %esi, c ## D;\
+ ror $1, c ## D;\
+ add k+4+round(%ebp),d ## D;\
+ xor %edi, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define decrypt_round(a,b,c,d,round)\
+ push c ## D;\
+ movzx a ## B, %edi;\
+ mov (%ebp,%edi,4), c ## D;\
+ movzx b ## B, %edi;\
+ mov s3(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx a ## B, %edi;\
+ xor s2(%ebp,%edi,4),c ## D;\
+ movzx b ## B, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $15, a ## D;\
+ xor s3(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ xor s2(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add %esi, c ## D;\
+ add c ## D, %esi;\
+ add k+round(%ebp), c ## D;\
+ xor %edi, c ## D;\
+ add k+4+round(%ebp),%esi;\
+ xor %esi, d ## D;\
+ rol $15, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * last round has different rotations for the output preparation
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+ push c ## D;\
+ movzx a ## B, %edi;\
+ mov (%ebp,%edi,4), c ## D;\
+ movzx b ## B, %edi;\
+ mov s3(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx a ## B, %edi;\
+ xor s2(%ebp,%edi,4),c ## D;\
+ movzx b ## B, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ xor s2(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add %esi, c ## D;\
+ add c ## D, %esi;\
+ add k+round(%ebp), c ## D;\
+ xor %edi, c ## D;\
+ add k+4+round(%ebp),%esi;\
+ xor %esi, d ## D;\
+ ror $1, d ## D;
+
+ENTRY(twofish_enc_blk)
+ push %ebp /* save registers according to calling convention*/
+ push %ebx
+ push %esi
+ push %edi
+
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
+
+ mov (%edi), %eax
+ mov b_offset(%edi), %ebx
+ mov c_offset(%edi), %ecx
+ mov d_offset(%edi), %edx
+ input_whitening(%eax,%ebp,a_offset)
+ ror $16, %eax
+ input_whitening(%ebx,%ebp,b_offset)
+ input_whitening(%ecx,%ebp,c_offset)
+ input_whitening(%edx,%ebp,d_offset)
+ rol $1, %edx
+
+ encrypt_round(R0,R1,R2,R3,0);
+ encrypt_round(R2,R3,R0,R1,8);
+ encrypt_round(R0,R1,R2,R3,2*8);
+ encrypt_round(R2,R3,R0,R1,3*8);
+ encrypt_round(R0,R1,R2,R3,4*8);
+ encrypt_round(R2,R3,R0,R1,5*8);
+ encrypt_round(R0,R1,R2,R3,6*8);
+ encrypt_round(R2,R3,R0,R1,7*8);
+ encrypt_round(R0,R1,R2,R3,8*8);
+ encrypt_round(R2,R3,R0,R1,9*8);
+ encrypt_round(R0,R1,R2,R3,10*8);
+ encrypt_round(R2,R3,R0,R1,11*8);
+ encrypt_round(R0,R1,R2,R3,12*8);
+ encrypt_round(R2,R3,R0,R1,13*8);
+ encrypt_round(R0,R1,R2,R3,14*8);
+ encrypt_last_round(R2,R3,R0,R1,15*8);
+
+ output_whitening(%eax,%ebp,c_offset)
+ output_whitening(%ebx,%ebp,d_offset)
+ output_whitening(%ecx,%ebp,a_offset)
+ output_whitening(%edx,%ebp,b_offset)
+ mov out_blk+16(%esp),%edi;
+ mov %eax, c_offset(%edi)
+ mov %ebx, d_offset(%edi)
+ mov %ecx, (%edi)
+ mov %edx, b_offset(%edi)
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+ mov $1, %eax
+ ret
+ENDPROC(twofish_enc_blk)
+
+ENTRY(twofish_dec_blk)
+ push %ebp /* save registers according to calling convention*/
+ push %ebx
+ push %esi
+ push %edi
+
+
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
+
+ mov (%edi), %eax
+ mov b_offset(%edi), %ebx
+ mov c_offset(%edi), %ecx
+ mov d_offset(%edi), %edx
+ output_whitening(%eax,%ebp,a_offset)
+ output_whitening(%ebx,%ebp,b_offset)
+ ror $16, %ebx
+ output_whitening(%ecx,%ebp,c_offset)
+ output_whitening(%edx,%ebp,d_offset)
+ rol $1, %ecx
+
+ decrypt_round(R0,R1,R2,R3,15*8);
+ decrypt_round(R2,R3,R0,R1,14*8);
+ decrypt_round(R0,R1,R2,R3,13*8);
+ decrypt_round(R2,R3,R0,R1,12*8);
+ decrypt_round(R0,R1,R2,R3,11*8);
+ decrypt_round(R2,R3,R0,R1,10*8);
+ decrypt_round(R0,R1,R2,R3,9*8);
+ decrypt_round(R2,R3,R0,R1,8*8);
+ decrypt_round(R0,R1,R2,R3,7*8);
+ decrypt_round(R2,R3,R0,R1,6*8);
+ decrypt_round(R0,R1,R2,R3,5*8);
+ decrypt_round(R2,R3,R0,R1,4*8);
+ decrypt_round(R0,R1,R2,R3,3*8);
+ decrypt_round(R2,R3,R0,R1,2*8);
+ decrypt_round(R0,R1,R2,R3,1*8);
+ decrypt_last_round(R2,R3,R0,R1,0);
+
+ input_whitening(%eax,%ebp,c_offset)
+ input_whitening(%ebx,%ebp,d_offset)
+ input_whitening(%ecx,%ebp,a_offset)
+ input_whitening(%edx,%ebp,b_offset)
+ mov out_blk+16(%esp),%edi;
+ mov %eax, c_offset(%edi)
+ mov %ebx, d_offset(%edi)
+ mov %ecx, (%edi)
+ mov %edx, b_offset(%edi)
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+ mov $1, %eax
+ ret
+ENDPROC(twofish_dec_blk)
diff --git a/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
new file mode 100644
index 0000000..e495e07
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Twofish Cipher 3-way parallel algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "twofish-x86_64-asm-3way.S"
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
+
+/**********************************************************************
+ 3-way twofish
+ **********************************************************************/
+#define CTX %rdi
+#define RIO %rdx
+
+#define RAB0 %rax
+#define RAB1 %rbx
+#define RAB2 %rcx
+
+#define RAB0d %eax
+#define RAB1d %ebx
+#define RAB2d %ecx
+
+#define RAB0bh %ah
+#define RAB1bh %bh
+#define RAB2bh %ch
+
+#define RAB0bl %al
+#define RAB1bl %bl
+#define RAB2bl %cl
+
+#define CD0 0x0(%rsp)
+#define CD1 0x8(%rsp)
+#define CD2 0x10(%rsp)
+
+# used only before/after all rounds
+#define RCD0 %r8
+#define RCD1 %r9
+#define RCD2 %r10
+
+# used only during rounds
+#define RX0 %r8
+#define RX1 %r9
+#define RX2 %r10
+
+#define RX0d %r8d
+#define RX1d %r9d
+#define RX2d %r10d
+
+#define RY0 %r11
+#define RY1 %r12
+#define RY2 %r13
+
+#define RY0d %r11d
+#define RY1d %r12d
+#define RY2d %r13d
+
+#define RT0 %rdx
+#define RT1 %rsi
+
+#define RT0d %edx
+#define RT1d %esi
+
+#define RT1bl %sil
+
+#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $(rot), ab; \
+ op1##l T0(CTX, tmp2, 4), dst ## d; \
+ op2##l T1(CTX, tmp1, 4), dst ## d;
+
+#define swap_ab_with_cd(ab, cd, tmp) \
+ movq cd, tmp; \
+ movq ab, cd; \
+ movq tmp, ab;
+
+/*
+ * Combined G1 & G2 function. Reordered with help of rotates to have moves
+ * at begining.
+ */
+#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
+ /* G1,1 && G2,1 */ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
+ \
+ /* G1,2 && G2,2 */ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
+ swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
+ swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
+ swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
+
+#define enc_round_end(ab, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ xorl ab ## d, x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ shrq $32, ab; \
+ roll $1, ab ## d; \
+ xorl y ## d, ab ## d; \
+ shlq $32, ab; \
+ rorl $1, x ## d; \
+ orq x, ab;
+
+#define dec_round_end(ba, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ xorl ba ## d, y ## d; \
+ shrq $32, ba; \
+ roll $1, ba ## d; \
+ xorl x ## d, ba ## d; \
+ shlq $32, ba; \
+ rorl $1, y ## d; \
+ orq y, ba;
+
+#define encrypt_round3(ab, cd, n) \
+ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
+ \
+ enc_round_end(ab ## 0, RX0, RY0, n); \
+ enc_round_end(ab ## 1, RX1, RY1, n); \
+ enc_round_end(ab ## 2, RX2, RY2, n);
+
+#define decrypt_round3(ba, dc, n) \
+ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
+ \
+ dec_round_end(ba ## 0, RX0, RY0, n); \
+ dec_round_end(ba ## 1, RX1, RY1, n); \
+ dec_round_end(ba ## 2, RX2, RY2, n);
+
+#define encrypt_cycle3(ab, cd, n) \
+ encrypt_round3(ab, cd, n*2); \
+ encrypt_round3(ab, cd, (n*2)+1);
+
+#define decrypt_cycle3(ba, dc, n) \
+ decrypt_round3(ba, dc, (n*2)+1); \
+ decrypt_round3(ba, dc, (n*2));
+
+#define push_cd() \
+ pushq RCD2; \
+ pushq RCD1; \
+ pushq RCD0;
+
+#define pop_cd() \
+ popq RCD0; \
+ popq RCD1; \
+ popq RCD2;
+
+#define inpack3(in, n, xy, m) \
+ movq 4*(n)(in), xy ## 0; \
+ xorq w+4*m(CTX), xy ## 0; \
+ \
+ movq 4*(4+(n))(in), xy ## 1; \
+ xorq w+4*m(CTX), xy ## 1; \
+ \
+ movq 4*(8+(n))(in), xy ## 2; \
+ xorq w+4*m(CTX), xy ## 2;
+
+#define outunpack3(op, out, n, xy, m) \
+ xorq w+4*m(CTX), xy ## 0; \
+ op ## q xy ## 0, 4*(n)(out); \
+ \
+ xorq w+4*m(CTX), xy ## 1; \
+ op ## q xy ## 1, 4*(4+(n))(out); \
+ \
+ xorq w+4*m(CTX), xy ## 2; \
+ op ## q xy ## 2, 4*(8+(n))(out);
+
+#define inpack_enc3() \
+ inpack3(RIO, 0, RAB, 0); \
+ inpack3(RIO, 2, RCD, 2);
+
+#define outunpack_enc3(op) \
+ outunpack3(op, RIO, 2, RAB, 6); \
+ outunpack3(op, RIO, 0, RCD, 4);
+
+#define inpack_dec3() \
+ inpack3(RIO, 0, RAB, 4); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ inpack3(RIO, 2, RCD, 6); \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2;
+
+#define outunpack_dec3() \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2; \
+ outunpack3(mov, RIO, 0, RCD, 0); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ outunpack3(mov, RIO, 2, RAB, 2);
+
+ENTRY(__twofish_enc_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ * %rcx: bool, if true: xor output
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rcx; /* bool xor */
+ pushq %rsi; /* dst */
+
+ inpack_enc3();
+
+ push_cd();
+ encrypt_cycle3(RAB, CD, 0);
+ encrypt_cycle3(RAB, CD, 1);
+ encrypt_cycle3(RAB, CD, 2);
+ encrypt_cycle3(RAB, CD, 3);
+ encrypt_cycle3(RAB, CD, 4);
+ encrypt_cycle3(RAB, CD, 5);
+ encrypt_cycle3(RAB, CD, 6);
+ encrypt_cycle3(RAB, CD, 7);
+ pop_cd();
+
+ popq RIO; /* dst */
+ popq RT1; /* bool xor */
+
+ testb RT1bl, RT1bl;
+ jnz .L__enc_xor3;
+
+ outunpack_enc3(mov);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+
+.L__enc_xor3:
+ outunpack_enc3(xor);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+ENDPROC(__twofish_enc_blk_3way)
+
+ENTRY(twofish_dec_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rsi; /* dst */
+
+ inpack_dec3();
+
+ push_cd();
+ decrypt_cycle3(RAB, CD, 7);
+ decrypt_cycle3(RAB, CD, 6);
+ decrypt_cycle3(RAB, CD, 5);
+ decrypt_cycle3(RAB, CD, 4);
+ decrypt_cycle3(RAB, CD, 3);
+ decrypt_cycle3(RAB, CD, 2);
+ decrypt_cycle3(RAB, CD, 1);
+ decrypt_cycle3(RAB, CD, 0);
+ pop_cd();
+
+ popq RIO; /* dst */
+
+ outunpack_dec3();
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+ENDPROC(twofish_dec_blk_3way)
diff --git a/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64.S b/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64.S
new file mode 100644
index 0000000..ecef2cb
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -0,0 +1,308 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/***************************************************************************
+* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
+* *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+#define a_offset 0
+#define b_offset 4
+#define c_offset 8
+#define d_offset 12
+
+/* Structure of the crypto context struct*/
+
+#define s0 0 /* S0 Array 256 Words each */
+#define s1 1024 /* S1 Array */
+#define s2 2048 /* S2 Array */
+#define s3 3072 /* S3 Array */
+#define w 4096 /* 8 whitening keys (word) */
+#define k 4128 /* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0 %rax
+#define R0D %eax
+#define R0B %al
+#define R0H %ah
+
+#define R1 %rbx
+#define R1D %ebx
+#define R1B %bl
+#define R1H %bh
+
+#define R2 %rcx
+#define R2D %ecx
+#define R2B %cl
+#define R2H %ch
+
+#define R3 %rdx
+#define R3D %edx
+#define R3B %dl
+#define R3H %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+ xor w+offset(context), src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+ xor w+16+offset(context), src;
+
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define encrypt_round(a,b,c,d,round)\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ movzx b ## H, %edi;\
+ ror $15, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ rol $15, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;
+
+/*
+ * a input register containing a(rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ xor a, %r10;\
+ movzx b ## H, %edi;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ ror $1, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ */
+#define decrypt_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $15, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ rol $15, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ xor a, %r10;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ ror $1, d ## D;
+
+ENTRY(twofish_enc_blk)
+ pushq R1
+
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ input_whitening(R1,%r11,a_offset)
+ input_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ rol $16, R0D
+ shr $32, R1
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R3D
+
+ encrypt_round(R0,R1,R2,R3,0);
+ encrypt_round(R2,R3,R0,R1,8);
+ encrypt_round(R0,R1,R2,R3,2*8);
+ encrypt_round(R2,R3,R0,R1,3*8);
+ encrypt_round(R0,R1,R2,R3,4*8);
+ encrypt_round(R2,R3,R0,R1,5*8);
+ encrypt_round(R0,R1,R2,R3,6*8);
+ encrypt_round(R2,R3,R0,R1,7*8);
+ encrypt_round(R0,R1,R2,R3,8*8);
+ encrypt_round(R2,R3,R0,R1,9*8);
+ encrypt_round(R0,R1,R2,R3,10*8);
+ encrypt_round(R2,R3,R0,R1,11*8);
+ encrypt_round(R0,R1,R2,R3,12*8);
+ encrypt_round(R2,R3,R0,R1,13*8);
+ encrypt_round(R0,R1,R2,R3,14*8);
+ encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+ output_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ output_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movl $1,%eax
+ ret
+ENDPROC(twofish_enc_blk)
+
+ENTRY(twofish_dec_blk)
+ pushq R1
+
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ output_whitening(R1,%r11,a_offset)
+ output_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ shr $32, R1
+ rol $16, R1D
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R2D
+
+ decrypt_round(R0,R1,R2,R3,15*8);
+ decrypt_round(R2,R3,R0,R1,14*8);
+ decrypt_round(R0,R1,R2,R3,13*8);
+ decrypt_round(R2,R3,R0,R1,12*8);
+ decrypt_round(R0,R1,R2,R3,11*8);
+ decrypt_round(R2,R3,R0,R1,10*8);
+ decrypt_round(R0,R1,R2,R3,9*8);
+ decrypt_round(R2,R3,R0,R1,8*8);
+ decrypt_round(R0,R1,R2,R3,7*8);
+ decrypt_round(R2,R3,R0,R1,6*8);
+ decrypt_round(R0,R1,R2,R3,5*8);
+ decrypt_round(R2,R3,R0,R1,4*8);
+ decrypt_round(R0,R1,R2,R3,3*8);
+ decrypt_round(R2,R3,R0,R1,2*8);
+ decrypt_round(R0,R1,R2,R3,1*8);
+ decrypt_last_round(R2,R3,R0,R1,0);
+
+ input_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ input_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movl $1,%eax
+ ret
+ENDPROC(twofish_dec_blk)
diff --git a/marvell/linux/arch/x86/crypto/twofish_avx_glue.c b/marvell/linux/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 0000000..3b36e97
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for AVX assembler version of Twofish Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/twofish.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/twofish.h>
+
+#define TWOFISH_PARALLEL_BLOCKS 8
+
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk);
+}
+
+static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk);
+}
+
+struct twofish_xts_ctx {
+ struct twofish_ctx tweak_ctx;
+ struct twofish_ctx crypt_ctx;
+};
+
+static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = twofish_ecb_enc_8way }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ecb = twofish_enc_blk_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = twofish_enc_blk }
+ } }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = twofish_ctr_8way }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = twofish_enc_blk_ctr }
+ } }
+};
+
+static const struct common_glue_ctx twofish_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = twofish_xts_enc_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = twofish_xts_enc }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = twofish_ecb_dec_8way }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ecb = twofish_dec_blk_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = twofish_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = twofish_cbc_dec_8way }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = twofish_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = twofish_xts_dec_8way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = twofish_xts_dec }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&twofish_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk,
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+}
+
+static struct skcipher_alg twofish_algs[] = {
+ {
+ .base.cra_name = "__ecb(twofish)",
+ .base.cra_driver_name = "__ecb-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(twofish)",
+ .base.cra_driver_name = "__cbc-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(twofish)",
+ .base.cra_driver_name = "__ctr-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .chunksize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(twofish)",
+ .base.cra_driver_name = "__xts-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * TF_MIN_KEY_SIZE,
+ .max_keysize = 2 * TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = xts_twofish_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)];
+
+static int __init twofish_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(twofish_algs,
+ ARRAY_SIZE(twofish_algs),
+ twofish_simd_algs);
+}
+
+static void __exit twofish_exit(void)
+{
+ simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs),
+ twofish_simd_algs);
+}
+
+module_init(twofish_init);
+module_exit(twofish_exit);
+
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("twofish");
diff --git a/marvell/linux/arch/x86/crypto/twofish_glue.c b/marvell/linux/arch/x86/crypto/twofish_glue.c
new file mode 100644
index 0000000..77e06c2
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish_glue.c
@@ -0,0 +1,100 @@
+/*
+ * Glue Code for assembler optimized version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_enc_blk);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_dec_blk);
+
+static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "twofish",
+ .cra_driver_name = "twofish-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = TF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct twofish_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = TF_MIN_KEY_SIZE,
+ .cia_max_keysize = TF_MAX_KEY_SIZE,
+ .cia_setkey = twofish_setkey,
+ .cia_encrypt = twofish_encrypt,
+ .cia_decrypt = twofish_decrypt
+ }
+ }
+};
+
+static int __init init(void)
+{
+ return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/marvell/linux/arch/x86/crypto/twofish_glue_3way.c b/marvell/linux/arch/x86/crypto/twofish_glue_3way.c
new file mode 100644
index 0000000..768af60
--- /dev/null
+++ b/marvell/linux/arch/x86/crypto/twofish_glue_3way.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for 3-way parallel assembler optimized version of Twofish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/twofish.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, true);
+}
+
+void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s)
+{
+ u128 ivs[2];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ u128_xor(&dst[1], &dst[1], &ivs[0]);
+ u128_xor(&dst[2], &dst[2], &ivs[1]);
+}
+EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
+
+void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ if (dst != src)
+ *dst = *src;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, dst, (u128 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
+
+void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
+{
+ be128 ctrblks[3];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
+
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ }
+
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[2], iv);
+ le128_inc(iv);
+
+ twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
+
+static const struct common_glue_ctx twofish_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ecb = twofish_enc_blk_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = twofish_enc_blk }
+ } }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = twofish_enc_blk_ctr }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ecb = twofish_dec_blk_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = twofish_dec_blk }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = twofish_dec_blk }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&twofish_ctr, req);
+}
+
+static struct skcipher_alg tf_skciphers[] = {
+ {
+ .base.cra_name = "ecb(twofish)",
+ .base.cra_driver_name = "ecb-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(twofish)",
+ .base.cra_driver_name = "cbc-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(twofish)",
+ .base.cra_driver_name = "ctr-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .chunksize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x06 &&
+ (boot_cpu_data.x86_model == 0x1c ||
+ boot_cpu_data.x86_model == 0x26 ||
+ boot_cpu_data.x86_model == 0x36)) {
+ /*
+ * On Atom, twofish-3way is slower than original assembler
+ * implementation. Twofish-3way trades off some performance in
+ * storing blocks in 64bit registers to allow three blocks to
+ * be processed parallel. Parallel operation then allows gaining
+ * more performance than was trade off, on out-of-order CPUs.
+ * However Atom does not benefit from this parallellism and
+ * should be blacklisted.
+ */
+ return true;
+ }
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, twofish-3way is slower than original assembler
+ * implementation because excessive uses of 64bit rotate and
+ * left-shifts (which are really slow on P4) needed to store and
+ * handle 128bit block in two 64bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "twofish-x86_64-3way: performance on this CPU "
+ "would be suboptimal: disabling "
+ "twofish-x86_64-3way.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(tf_skciphers,
+ ARRAY_SIZE(tf_skciphers));
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");