[Feature] add MT2731_MP2_MR2_SVN388 baseline version
Change-Id: Ief04314834b31e27effab435d3ca8ba33b499059
diff --git a/src/bsp/lk/lib/rsa/arch/arm/armv4-mont.S b/src/bsp/lk/lib/rsa/arch/arm/armv4-mont.S
new file mode 100644
index 0000000..b6c388e
--- /dev/null
+++ b/src/bsp/lk/lib/rsa/arch/arm/armv4-mont.S
@@ -0,0 +1,589 @@
+#if defined(__arm__)
+#define __ARM_MAX_ARCH__ 8
+
+.text
+.code 32
+
+#if __ARM_MAX_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lbn_mul_mont
+#endif
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,%function
+
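+@ bn_mul_mont(rp=r0, ap=r1, bp=r2, np=r3, n0=[sp], num=[sp,#4])
+@ computes rp[] = ap[]*bp[]*2^(-32*num) mod np[], where *n0 = -np[0]^-1
+@ mod 2^32; returns 1, or 0 when num is too small (num < 2).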
+.align 5
+bn_mul_mont:
+.Lbn_mul_mont:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,bn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+ tst r0,#1 @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov r0,ip @ load num
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
+
+ mov r0,r0,lsl#2 @ rescale r0 for byte count
+ sub sp,sp,r0 @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub r0,r0,#4 @ "num=num-1"
+ add r4,r2,r0 @ &bp[num-1]
+
+ add r0,sp,r0 @ r0 to point at &tp[num-1]
+ ldr r8,[r0,#14*4] @ &n0
+ ldr r2,[r2] @ bp[0]
+ ldr r5,[r1],#4 @ ap[0],ap++
+ ldr r6,[r3],#4 @ np[0],np++
+ ldr r8,[r8] @ *n0
+ str r4,[r0,#15*4] @ save &bp[num]
+
+ umull r10,r11,r5,r2 @ ap[0]*bp[0]
+ str r8,[r0,#14*4] @ save n0 value
+ mul r8,r10,r8 @ "tp[0]"*n0
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
+ mov r4,sp
+
+.L1st:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[0]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .L1st
+
+ adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
+ mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ str r14,[r0,#4] @ tp[num]=
+
+.Louter:
+ sub r7,r0,sp @ "original" r0-1 value
+ sub r1,r1,r7 @ "rewind" ap to &ap[1]
+ ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
+ ldr r5,[r1,#-4] @ ap[0]
+ ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
+ ldr r7,[sp,#4] @ tp[1]
+
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
+ str r4,[r0,#13*4] @ save bp
+ mul r8,r10,r8
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
+ mov r4,sp
+
+.Linner:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[i]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .Linner
+
+ adds r12,r12,r11
+ mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
+ adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adds r12,r12,r7
+ ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ str r14,[r0,#4] @ tp[num]=
+
+ cmp r4,r7
+ bne .Louter
+
+ ldr r2,[r0,#12*4] @ pull rp
+ add r0,r0,#4 @ r0 to point at &tp[num]
+ sub r5,r0,sp @ "original" num value
+ mov r4,sp @ "rewind" r4
+ mov r1,r4 @ "borrow" r1
+ sub r3,r3,r5 @ "rewind" r3 to &np[0]
+
+ subs r7,r7,r7 @ "clear" carry flag
+.Lsub: ldr r7,[r4],#4
+ ldr r6,[r3],#4
+ sbcs r7,r7,r6 @ tp[j]-np[j]
+ str r7,[r2],#4 @ rp[j]=
+ teq r4,r0 @ preserve carry
+ bne .Lsub
+ sbcs r14,r14,#0 @ upmost carry
+ mov r4,sp @ "rewind" r4
+ sub r2,r2,r5 @ "rewind" r2
+
+ and r1,r4,r14
+ bic r3,r2,r14
+ orr r1,r1,r3 @ ap=borrow?tp:rp
+
+.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh
+ str sp,[r4],#4 @ zap tp
+ str r7,[r2],#4
+ cmp r4,r0
+ bne .Lcopy
+
+ add sp,r0,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ bx lr @ .word 0xe12fff1e
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_mul_mont,.-bn_mul_mont
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load rest of parameter block
+
+ sub r7,sp,#16
+ vld1.32 {d28[0]}, [r2,:32]!
+ sub r7,r7,r5,lsl#4
+ vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
+ and r7,r7,#-64
+ vld1.32 {d30[0]}, [r4,:32]
+ mov sp,r7 @ alloca
+ veor d8,d8,d8
+ subs r8,r5,#8
+ vzip.16 d28,d8
+
+ vmull.u32 q6,d28,d0[0]
+ vmull.u32 q7,d28,d0[1]
+ vmull.u32 q8,d28,d1[0]
+ vshl.i64 d10,d13,#16
+ vmull.u32 q9,d28,d1[1]
+
+ vadd.u64 d10,d10,d12
+ veor d8,d8,d8
+ vmul.u32 d29,d10,d30
+
+ vmull.u32 q10,d28,d2[0]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+ vmull.u32 q11,d28,d2[1]
+ vmull.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmull.u32 q13,d28,d3[1]
+
+ bne .LNEON_1st
+
+ @ special case for num=8, everything is in register bank...
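+	@ (ap[0..7] stays in d0-d3, np[0..7] in d4-d7, and the eight 64-bit
+	@ accumulators in q6-q13, so nothing touches the stack until the
+	@ final carry propagation in .LNEON_tail2)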
+
+ vmlal.u32 q6,d29,d4[0]
+ sub r9,r5,#1
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ vadd.u64 d12,d12,d10
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d10,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+
+ vadd.u64 d10,d10,d12
+ veor d8,d8,d8
+ subs r9,r9,#1
+ vmul.u32 d29,d10,d30
+
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 d12,d12,d10
+ mov r7,sp
+ vshr.u64 d10,d12,#16
+ mov r8,r5
+ vadd.u64 d13,d13,d10
+ add r6,sp,#16
+ vshr.u64 d10,d13,#16
+ vzip.16 d12,d13
+
+ b .LNEON_tail2
+
+.align 4
+.LNEON_1st:
+ vmlal.u32 q6,d29,d4[0]
+ vld1.32 {d0,d1,d2,d3}, [r1]!
+ vmlal.u32 q7,d29,d4[1]
+ subs r8,r8,#8
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vld1.32 {d4,d5}, [r3]!
+ vmlal.u32 q11,d29,d6[1]
+ vst1.64 {q6,q7}, [r7,:256]!
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vst1.64 {q8,q9}, [r7,:256]!
+
+ vmull.u32 q6,d28,d0[0]
+ vld1.32 {d6,d7}, [r3]!
+ vmull.u32 q7,d28,d0[1]
+ vst1.64 {q10,q11}, [r7,:256]!
+ vmull.u32 q8,d28,d1[0]
+ vmull.u32 q9,d28,d1[1]
+ vst1.64 {q12,q13}, [r7,:256]!
+
+ vmull.u32 q10,d28,d2[0]
+ vmull.u32 q11,d28,d2[1]
+ vmull.u32 q12,d28,d3[0]
+ vmull.u32 q13,d28,d3[1]
+
+ bne .LNEON_1st
+
+ vmlal.u32 q6,d29,d4[0]
+ add r6,sp,#16
+ vmlal.u32 q7,d29,d4[1]
+ sub r1,r1,r5,lsl#2 @ rewind r1
+ vmlal.u32 q8,d29,d5[0]
+ vld1.64 {q5}, [sp,:128]
+ vmlal.u32 q9,d29,d5[1]
+ sub r9,r5,#1
+
+ vmlal.u32 q10,d29,d6[0]
+ vst1.64 {q6,q7}, [r7,:256]!
+ vmlal.u32 q11,d29,d6[1]
+ vshr.u64 d10,d10,#16
+ vld1.64 {q6}, [r6, :128]!
+ vmlal.u32 q12,d29,d7[0]
+ vst1.64 {q8,q9}, [r7,:256]!
+ vmlal.u32 q13,d29,d7[1]
+
+ vst1.64 {q10,q11}, [r7,:256]!
+ vadd.u64 d10,d10,d11
+ veor q4,q4,q4
+ vst1.64 {q12,q13}, [r7,:256]!
+ vld1.64 {q7,q8}, [r6, :256]!
+ vst1.64 {q4}, [r7,:128]
+ vshr.u64 d10,d10,#16
+
+ b .LNEON_outer
+
+.align 4
+.LNEON_outer:
+ vld1.32 {d28[0]}, [r2,:32]!
+ sub r3,r3,r5,lsl#2 @ rewind r3
+ vld1.32 {d0,d1,d2,d3}, [r1]!
+ veor d8,d8,d8
+ mov r7,sp
+ vzip.16 d28,d8
+ sub r8,r5,#8
+ vadd.u64 d12,d12,d10
+
+ vmlal.u32 q6,d28,d0[0]
+ vld1.64 {q9,q10},[r6,:256]!
+ vmlal.u32 q7,d28,d0[1]
+ vmlal.u32 q8,d28,d1[0]
+ vld1.64 {q11,q12},[r6,:256]!
+ vmlal.u32 q9,d28,d1[1]
+
+ vshl.i64 d10,d13,#16
+ veor d8,d8,d8
+ vadd.u64 d10,d10,d12
+ vld1.64 {q13},[r6,:128]!
+ vmul.u32 d29,d10,d30
+
+ vmlal.u32 q10,d28,d2[0]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+
+.LNEON_inner:
+ vmlal.u32 q6,d29,d4[0]
+ vld1.32 {d0,d1,d2,d3}, [r1]!
+ vmlal.u32 q7,d29,d4[1]
+ subs r8,r8,#8
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+ vst1.64 {q6,q7}, [r7,:256]!
+
+ vmlal.u32 q10,d29,d6[0]
+ vld1.64 {q6}, [r6, :128]!
+ vmlal.u32 q11,d29,d6[1]
+ vst1.64 {q8,q9}, [r7,:256]!
+ vmlal.u32 q12,d29,d7[0]
+ vld1.64 {q7,q8}, [r6, :256]!
+ vmlal.u32 q13,d29,d7[1]
+ vst1.64 {q10,q11}, [r7,:256]!
+
+ vmlal.u32 q6,d28,d0[0]
+ vld1.64 {q9,q10}, [r6, :256]!
+ vmlal.u32 q7,d28,d0[1]
+ vst1.64 {q12,q13}, [r7,:256]!
+ vmlal.u32 q8,d28,d1[0]
+ vld1.64 {q11,q12}, [r6, :256]!
+ vmlal.u32 q9,d28,d1[1]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+
+ vmlal.u32 q10,d28,d2[0]
+ vld1.64 {q13}, [r6, :128]!
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vmlal.u32 q13,d28,d3[1]
+
+ bne .LNEON_inner
+
+ vmlal.u32 q6,d29,d4[0]
+ add r6,sp,#16
+ vmlal.u32 q7,d29,d4[1]
+ sub r1,r1,r5,lsl#2 @ rewind r1
+ vmlal.u32 q8,d29,d5[0]
+ vld1.64 {q5}, [sp,:128]
+ vmlal.u32 q9,d29,d5[1]
+ subs r9,r9,#1
+
+ vmlal.u32 q10,d29,d6[0]
+ vst1.64 {q6,q7}, [r7,:256]!
+ vmlal.u32 q11,d29,d6[1]
+ vld1.64 {q6}, [r6, :128]!
+ vshr.u64 d10,d10,#16
+ vst1.64 {q8,q9}, [r7,:256]!
+ vmlal.u32 q12,d29,d7[0]
+ vld1.64 {q7,q8}, [r6, :256]!
+ vmlal.u32 q13,d29,d7[1]
+
+ vst1.64 {q10,q11}, [r7,:256]!
+ vadd.u64 d10,d10,d11
+ vst1.64 {q12,q13}, [r7,:256]!
+ vshr.u64 d10,d10,#16
+
+ bne .LNEON_outer
+
+ mov r7,sp
+ mov r8,r5
+
+.LNEON_tail:
+ vadd.u64 d12,d12,d10
+ vld1.64 {q9,q10}, [r6, :256]!
+ vshr.u64 d10,d12,#16
+ vadd.u64 d13,d13,d10
+ vld1.64 {q11,q12}, [r6, :256]!
+ vshr.u64 d10,d13,#16
+ vld1.64 {q13}, [r6, :128]!
+ vzip.16 d12,d13
+
+.LNEON_tail2:
+ vadd.u64 d14,d14,d10
+ vst1.32 {d12[0]}, [r7, :32]!
+ vshr.u64 d10,d14,#16
+ vadd.u64 d15,d15,d10
+ vshr.u64 d10,d15,#16
+ vzip.16 d14,d15
+
+ vadd.u64 d16,d16,d10
+ vst1.32 {d14[0]}, [r7, :32]!
+ vshr.u64 d10,d16,#16
+ vadd.u64 d17,d17,d10
+ vshr.u64 d10,d17,#16
+ vzip.16 d16,d17
+
+ vadd.u64 d18,d18,d10
+ vst1.32 {d16[0]}, [r7, :32]!
+ vshr.u64 d10,d18,#16
+ vadd.u64 d19,d19,d10
+ vshr.u64 d10,d19,#16
+ vzip.16 d18,d19
+
+ vadd.u64 d20,d20,d10
+ vst1.32 {d18[0]}, [r7, :32]!
+ vshr.u64 d10,d20,#16
+ vadd.u64 d21,d21,d10
+ vshr.u64 d10,d21,#16
+ vzip.16 d20,d21
+
+ vadd.u64 d22,d22,d10
+ vst1.32 {d20[0]}, [r7, :32]!
+ vshr.u64 d10,d22,#16
+ vadd.u64 d23,d23,d10
+ vshr.u64 d10,d23,#16
+ vzip.16 d22,d23
+
+ vadd.u64 d24,d24,d10
+ vst1.32 {d22[0]}, [r7, :32]!
+ vshr.u64 d10,d24,#16
+ vadd.u64 d25,d25,d10
+ vld1.64 {q6}, [r6, :128]!
+ vshr.u64 d10,d25,#16
+ vzip.16 d24,d25
+
+ vadd.u64 d26,d26,d10
+ vst1.32 {d24[0]}, [r7, :32]!
+ vshr.u64 d10,d26,#16
+ vadd.u64 d27,d27,d10
+ vld1.64 {q7,q8}, [r6, :256]!
+ vshr.u64 d10,d27,#16
+ vzip.16 d26,d27
+ subs r8,r8,#8
+ vst1.32 {d26[0]}, [r7, :32]!
+
+ bne .LNEON_tail
+
+ vst1.32 {d10[0]}, [r7, :32] @ top-most bit
+ sub r3,r3,r5,lsl#2 @ rewind r3
+ subs r1,sp,#0 @ clear carry flag
+ add r2,sp,r5,lsl#2
+
+.LNEON_sub:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r3!, {r8,r9,r10,r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_sub
+
+ ldr r10, [r1] @ load top-most bit
+ veor q0,q0,q0
+ sub r11,r2,sp @ this is num*4
+ veor q1,q1,q1
+ mov r1,sp
+ sub r0,r0,r11 @ rewind r0
+ mov r3,r2 @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r0, {r8,r9,r10,r11}
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ movcc r11,r7
+ ldmia r1, {r4,r5,r6,r7}
+ stmia r0!, {r8,r9,r10,r11}
+ sub r1,r1,#16
+ ldmia r0, {r8,r9,r10,r11}
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r1,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ movcc r11,r7
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_copy_n_zap
+
+ sub sp,ip,#96
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ bx lr @ .word 0xe12fff1e
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
\ No newline at end of file
diff --git a/src/bsp/lk/lib/rsa/arch/arm64/armv8-mont.S b/src/bsp/lk/lib/rsa/arch/arm64/armv8-mont.S
new file mode 100644
index 0000000..de1f394
--- /dev/null
+++ b/src/bsp/lk/lib/rsa/arch/arm64/armv8-mont.S
@@ -0,0 +1,1208 @@
+#if defined(__aarch64__)
+.text
+
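+// bn_mul_mont(rp=x0, ap=x1, bp=x2, np=x3, n0=x4, num=x5) computes
+// rp[] = ap[]*bp[]*2^(-64*num) mod np[], with *n0 = -np[0]^-1 mod 2^64.
+// In this copy the generic entry has no body: execution falls through to
+// __bn_sqr8x_mont, which handles the ap==bp (squaring) case and branches
+// to __bn_mul4x_mont otherwise; callers are expected to pass a num these
+// unrolled paths support (e.g. 32 words for RSA-2048).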
+.globl bn_mul_mont
+.type bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+.size bn_mul_mont,.-bn_mul_mont
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,.Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+ adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,.Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+.Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq .Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,.Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+.Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,.Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
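+	// (each pass of .Lsqr8x_reduction cancels the eight low limbs of the
+	// current window: per limb, t[0]*n0 mod 2^64 is formed and n[0..7]
+	// times that value is accumulated so that t[0] becomes zero)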
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+.Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,.Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,.Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+.Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,.Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne .Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+.Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,.Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+.Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,.Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // x19-7,x28 hold result, x6-7 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // x6-7 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+.Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ ret
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+.type __bn_mul4x_mont,%function
+.align 5
+__bn_mul4x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub x26,sp,x5,lsl#3
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ sub sp,x26,#8*4 // alloca
+
+ add x10,x2,x5
+ add x27,x1,x5
+ stp x0,x10,[x29,#96] // offload rp and &b[num]
+
+ ldr x24,[x2,#8*0] // b[0]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x28,#0
+ mov x26,sp
+
+.Loop_mul4x_1st_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[0])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[0])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ sub x10,x27,x1
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_reduction
+
+ cbz x10,.Lmul4x4_post_condition
+
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldr x25,[sp] // a[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.Loop_mul4x_1st_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[i])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[i])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ adcs x23,x23,x0
+ umulh x13,x17,x25
+ adc x0,xzr,xzr
+ ldr x25,[sp,x28] // next t[0]*n0
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_tail
+
+ sub x11,x27,x5 // rewinded x1
+ cbz x10,.Lmul4x_proceed
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+ ldr x24,[x2,#8*4]! // *++b
+ adc x30,x0,xzr
+ ldp x6,x7,[x11,#8*0] // a[0..3]
+ sub x3,x3,x5 // rewind np
+ ldp x8,x9,[x11,#8*2]
+ add x1,x11,#8*4
+
+ stp x19,x20,[x26,#8*0] // result!!!
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6]
+
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ mov x26,sp
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+
+.align 4
+.Loop_mul4x_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[4])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_reduction
+
+ adc x0,x0,xzr
+ ldp x10,x11,[x26,#8*4] // t[4..7]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+
+ ldr x25,[sp] // t[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.align 4
+.Loop_mul4x_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[4])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ umulh x13,x17,x25
+ adcs x23,x23,x0
+ ldr x25,[sp,x28] // next a[0]*n0
+ adc x0,xzr,xzr
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_tail
+
+ sub x11,x3,x5 // rewinded np?
+ adc x0,x0,xzr
+ cbz x10,.Loop_mul4x_break
+
+ ldp x10,x11,[x26,#8*4]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_tail
+
+.align 4
+.Loop_mul4x_break:
+ ldp x12,x13,[x29,#96] // pull rp and &b[num]
+ adds x19,x19,x30
+ add x2,x2,#8*4 // bp++
+ adcs x20,x20,xzr
+ sub x1,x1,x5 // rewind ap
+ adcs x21,x21,xzr
+ stp x19,x20,[x26,#8*0] // result!!!
+ adcs x22,x22,xzr
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ adc x30,x0,xzr
+ stp x21,x22,[x26,#8*2] // result!!!
+ cmp x2,x13 // done yet?
+ ldp x21,x22,[sp,#8*6]
+ ldp x14,x15,[x11,#8*0] // n[0..3]
+ ldp x16,x17,[x11,#8*2]
+ add x3,x11,#8*4
+ b.eq .Lmul4x_post
+
+ ldr x24,[x2]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ adds x1,x1,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x26,sp
+ b .Loop_mul4x_reduction
+
+.align 4
+.Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov x0,x12
+ mov x27,x12 // x0 copy
+ subs x10,x19,x14
+ add x26,sp,#8*8
+ sbcs x11,x20,x15
+ sub x28,x5,#8*4
+
+.Lmul4x_sub:
+ sbcs x12,x21,x16
+ ldp x14,x15,[x3,#8*0]
+ sub x28,x28,#8*4
+ ldp x19,x20,[x26,#8*0]
+ sbcs x13,x22,x17
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ ldp x21,x22,[x26,#8*2]
+ add x26,x26,#8*4
+ stp x10,x11,[x0,#8*0]
+ sbcs x10,x19,x14
+ stp x12,x13,[x0,#8*2]
+ add x0,x0,#8*4
+ sbcs x11,x20,x15
+ cbnz x28,.Lmul4x_sub
+
+ sbcs x12,x21,x16
+ mov x26,sp
+ add x1,sp,#8*4
+ ldp x6,x7,[x27,#8*0]
+ sbcs x13,x22,x17
+ stp x10,x11,[x0,#8*0]
+ ldp x8,x9,[x27,#8*2]
+ stp x12,x13,[x0,#8*2]
+ ldp x19,x20,[x1,#8*0]
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub x28,x5,#8*4
+.Lmul4x_cond_copy:
+ sub x28,x28,#8*4
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ ldp x6,x7,[x27,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*2]
+ add x26,x26,#8*4
+ csel x13,x22,x9,lo
+ ldp x8,x9,[x27,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+ add x27,x27,#8*4
+ cbnz x28,.Lmul4x_cond_copy
+
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ stp xzr,xzr,[x26,#8*2]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*3]
+ csel x13,x22,x9,lo
+ stp xzr,xzr,[x26,#8*4]
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+
+ b .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+ adc x0,x0,xzr
+ ldr x1,[x29,#96] // pull rp
+ // x19-3,x0 hold result, x14-7 hold modulus
+ subs x6,x19,x14
+ ldr x30,[x29,#8] // pull return address
+ sbcs x7,x20,x15
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x16
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x17
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+ // x6-3 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ stp x8,x9,[x1,#8*2]
+
+.Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ ret
+.size __bn_mul4x_mont,.-__bn_mul4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif
\ No newline at end of file
diff --git a/src/bsp/lk/lib/rsa/rsa.c b/src/bsp/lk/lib/rsa/rsa.c
new file mode 100644
index 0000000..c28ff31
--- /dev/null
+++ b/src/bsp/lk/lib/rsa/rsa.c
@@ -0,0 +1,44 @@
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <rsa.h>
+#if defined(__aarch64__)
+typedef __uint128_t uint128_t;
+typedef uint128_t uintptr2_t;
+#elif defined(__arm__)
+typedef uint64_t uintptr2_t;
+#else
+#error Unsupported architecture
+#endif
+
+#define N_BYTES (2048 / 8)
+#define N_WORDS (N_BYTES / sizeof(uintptr_t))
+
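+/*
+ * Montgomery multiplication kernel from armv4-mont.S / armv8-mont.S:
+ * rp[] = ap[] * bp[] * R^-1 mod np[], where R = 2^(8 * sizeof(uintptr_t)
+ * * num) and *n0 = -np[0]^-1 mod 2^(8 * sizeof(uintptr_t)); num counts
+ * machine words.
+ */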
+int bn_mul_mont(uintptr_t *rp, const uintptr_t *ap, const uintptr_t *bp,
+ const uintptr_t *np, const uint64_t *n0, int num);
+
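+/*
+ * r = a^65537 mod m, with 65537 = 2^16 + 1: convert a to Montgomery form
+ * (ar = a*R), square 16 times to get a^(2^16)*R, multiply once by ar for
+ * a^(2^16+1)*R, then multiply by 1 to drop the Montgomery factor R.
+ */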
+void mod_exp_65537_mont(uintptr_t *r, const uintptr_t *a, const struct key_prop *pkey)
+{
+ const uintptr_t *m, *rr;
+ uint64_t n0inv;
+ uint32_t n_words;
+ size_t i;
+
+	m = pkey->modulus;
+	rr = pkey->rr;
+	n0inv = pkey->n0inv;
+	n_words = (pkey->num_bits / 8) / sizeof(uintptr_t);
+
+ uintptr_t ar[n_words];
+
+ bn_mul_mont(ar, a, rr, m, &n0inv, n_words);
+ bn_mul_mont(r, ar, ar, m, &n0inv, n_words);
+ for (i = 15; i != 0; i--)
+ bn_mul_mont(r, r, r, m, &n0inv, n_words);
+ bn_mul_mont(r, r, ar, m, &n0inv, n_words);
+
+ memset(ar, 0, sizeof(ar));
+ ar[0] = 1;
+ bn_mul_mont(r, r, ar, m, &n0inv, n_words);
+}
+
diff --git a/src/bsp/lk/lib/rsa/rules.mk b/src/bsp/lk/lib/rsa/rules.mk
new file mode 100644
index 0000000..f52d607
--- /dev/null
+++ b/src/bsp/lk/lib/rsa/rules.mk
@@ -0,0 +1,10 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+
+MODULE_SRCS += \
+ $(LOCAL_DIR)/arch/arm/armv4-mont.S \
+ $(LOCAL_DIR)/arch/arm64/armv8-mont.S \
+ $(LOCAL_DIR)/rsa.c
+
+include make/module.mk
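
For context, a minimal sketch of how the new entry point might be driven for RSA-2048 signature verification in lk. The key_prop field names come from rsa.c above; the helper name, the word-count macro, and the assumption that rsa.h declares struct key_prop and mod_exp_65537_mont are illustrative only and not part of this patch.

#include <stdint.h>
#include <rsa.h>   /* assumed to declare struct key_prop and mod_exp_65537_mont() */

/* 2048-bit operands as machine words, matching N_WORDS in rsa.c. */
#define RSA2048_WORDS ((2048 / 8) / sizeof(uintptr_t))

/* Hypothetical caller: recover the PKCS#1 v1.5 encoded block from an
 * RSA-2048 signature; sig and out are RSA2048_WORDS words in the
 * little-endian limb order that bn_mul_mont expects. */
static void rsa2048_public_op(uintptr_t *out, const uintptr_t *sig,
                              const struct key_prop *pkey)
{
    /* out = sig^65537 mod pkey->modulus, using the precomputed
     * Montgomery constants pkey->rr and pkey->n0inv. */
    mod_exp_65537_mont(out, sig, pkey);

    /* A real verifier would now check the padding and compare the
     * embedded digest against the hash of the signed image. */
}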