[Feature]add MT2731_MP2_MR2_SVN388 baseline version
Change-Id: Ief04314834b31e27effab435d3ca8ba33b499059
diff --git a/src/bsp/lk/lib/sha256/arch/arm64/sha256-armv8.S b/src/bsp/lk/lib/sha256/arch/arm64/sha256-armv8.S
new file mode 100644
index 0000000..28be18a
--- /dev/null
+++ b/src/bsp/lk/lib/sha256/arch/arm64/sha256-armv8.S
@@ -0,0 +1,198 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ * project. The module is, however, dual licensed under OpenSSL and
+ * CRYPTOGAMS licenses depending on where you obtain it. For further
+ * details see http://www.openssl.org/~appro/cryptogams/.
+ * ====================================================================
+ *
+ * SHA256/512 for ARMv8.
+ *
+ * Performance in cycles per processed byte and improvement coefficient
+ * over code generated with "default" compiler:
+ *
+ * SHA256-hw SHA256(*) SHA512
+ * Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+ * Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***))
+ * Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+ *
+ * (*) Software SHA256 results are of lesser relevance, presented
+ * mostly for informational purposes.
+ * (**) The result is a trade-off: it's possible to improve it by
+ * 10% (or by 1 cycle per round), but at the cost of 20% loss
+ * on Cortex-A53 (or by 4 cycles per round).
+ * (***) Super-impressive coefficients over gcc-generated code are
+ * indication of some compiler "pathology", most notably code
+ * generated with -mgeneral-regs-only is significantly faster
+ * and lags behind assembly only by 50-90%.
+ */
+
+.text // the constant table below shares .text with the code, which addresses it with adr
+.globl sha256_block_data_order // exported symbol; its label is defined after the data
+.type sha256_block_data_order,%function
+.align 6 // 2^6 = 64-byte alignment for the table
+.type .LK256,%object
+.LK256: // SHA-256 round constants: 16 rows of four 32-bit words, loaded 16 bytes at a time through x3
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 // K[0..3]
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 // K[4..7]
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 // K[8..11]
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 // K[12..15]
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc // K[16..19]
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da // K[20..23]
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 // K[24..27]
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 // K[28..31]
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 // K[32..35]
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 // K[36..39]
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 // K[40..43]
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 // K[44..47]
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 // K[48..51]
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 // K[52..55]
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 // K[56..59]
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 // K[60..63]
+.long 0 // terminator word; the code in this file never loads it (it reads exactly 16 x 16 bytes)
+.size .LK256,.-.LK256
+.align 3 // 2^3 = 8-byte alignment for the .quad below
+.LOPENSSL_armcap_P: // not referenced by any instruction in this file
+.quad OPENSSL_armcap_P-. // offset from this location to the OPENSSL_armcap_P symbol
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ASCII ident string beginning "SHA256 block transform for ARMv8, CRYPTOGAMS by"
+.align 2 // 4-byte alignment after the string
+.align 2 // repeated directive; no further effect
+.type sha256_block_data_order,%function // repeats the .type already given before the table
+.align 6 // function entry on a 64-byte boundary
+sha256_block_data_order: // SHA-256 block transform: x0 = state (eight 32-bit words, updated in place), x1 = input, x2 = number of 64-byte blocks
+.Lv8_entry: // NOTE(review): the SHA-256 instructions below run unconditionally; OPENSSL_armcap_P is not consulted in this file - confirm the target always implements them
+ stp x29,x30,[sp,#-16]! // push x29/x30, sp -= 16
+ add x29,sp,#0 // x29 = sp; besides x29 the code below writes only x1-x3, v0-v2, v4-v7, v16-v19
+ // NOTE(review): x2 is not tested before the loop; x2 == 0 would wrap at the first sub and keep looping - confirm callers always pass >= 1
+ ld1 {v0.4s,v1.4s},[x0] // v0,v1 = the eight 32-bit state words at [x0]
+ adr x3,.LK256 // x3 = start of the round-constant table
+
+.Loop_hw: // one pass per 64-byte input block
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 // v4-v7 = next 64 input bytes, x1 += 64
+ sub x2,x2,#1 // one fewer block remaining
+ ld1 {v16.4s},[x3],#16 // v16 = first four constants, x3 += 16
+ rev32 v4.16b,v4.16b // reverse the bytes inside every 32-bit word of the block (v4-v7)
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload: keep the entry state in v18/v19 for the add after round 63
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16 // v17 = next four constants; v16/v17 alternate from here on
+ add v16.4s,v16.4s,v4.4s // rounds 0-3: constants + schedule words
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b // v2 = v0 before sha256h overwrites it; sha256h2 reads the old value
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s // rounds 4-7
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s // rounds 8-11
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s // rounds 12-15
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s // rounds 16-19
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s // rounds 20-23
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s // rounds 24-27
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s // rounds 28-31
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s // rounds 32-35
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s // rounds 36-39
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s // rounds 40-43
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s // rounds 44-47
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s // rounds 48-51; no sha256su0/su1 from here on
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s // rounds 52-55
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3] // last four constants; no post-increment this time
+ add v16.4s,v16.4s,v6.4s // rounds 56-59
+ sub x3,x3,#64*4-16 // rewind x3 to .LK256: undo the 15 post-increments of 16 bytes
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s // rounds 60-63
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s // add the state saved at the top of this iteration
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,.Loop_hw // repeat while blocks remain
+
+ st1 {v0.4s,v1.4s},[x0] // write the updated state back to [x0]
+
+ ldr x29,[sp],#16 // restore x29 and pop the 16-byte frame; x30 is not reloaded because nothing above modifies it
+ ret
+.size sha256_block_data_order,.-sha256_block_data_order
+.comm OPENSSL_armcap_P,4,4 // common symbol: 4 bytes, alignment operand 4; this file only takes its offset at .LOPENSSL_armcap_P