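/*
void v_convolve(
    Word16 x[],        // (i)     : input vector
    Word16 h[],        // (i)     : impulse response
    Word16 y[]         // (o)     : output vector
)

The routine is exported below under the name Convolve_asm and takes
x in r0, h in r1 and y in r2.  The vector length is fixed at 40 samples
by the code; there is no length argument.
*/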
#include "oscl_base_macros.h"
#if (PV_CPU_ARCH_VERSION >= 6)
	.text
	.align	5
	.word	0
	.global Convolve_asm
	.type   Convolve_asm, function

@ Byte offsets into the local frame, measured from sp after LOC_SIZE has been subtracted.
.equ	S_A,        0		@ three words: sums parked with STM/LDM on sp between steps
.equ	NUM_CNT,    16		@ packed loop counter word
.equ	P_X,        20		@ x read pointer used inside the current pass
.equ	P_X_START,  24		@ x pointer at which the next pass begins
.equ	P_H,        28		@ saved h pointer
.equ	P_Y,        32		@ saved y pointer
.equ	P_LAST,     36		@ slot defined here but not referenced by the code in this file
.equ	LAST,       40		@ 160 bytes: forty 32-bit running sums
.equ	X_A,        200		@ 160 bytes reserved; COPY_X writes its realigned copy of x here
.equ	LOC_SIZE,   368		@ total number of bytes taken from the stack for the frame


@ Register aliases.  Several names share one physical register, so the
@ overlaid names must never be live at the same time.

@ r0-r3: 32-bit running sums (destination and addend of the multiply-accumulates)
s_1      .req r0
s_2      .req r1
s_3      .req r2
s_4      .req r3

@ r4-r7: four words of x, two 16-bit samples per word
x_1      .req r4
x_2      .req r5
x_3      .req r6
x_4      .req r7

@ r8-r11: four words of h, two 16-bit taps per word
h_1      .req r8
h_2      .req r9
h_3      .req r10
h_4      .req r11

@ Pointers and counter
p_h      .req r0		@ overlays s_1
p_y      .req r4		@ overlays x_1
p_last   .req r12
p_x      .req r14		@ shares r14 with num_cnt
num_cnt  .req r14		@ shares r14 with p_x

@ Scratch words used by the Write macro
w_1      .req r5		@ overlays x_2
w_2      .req r6		@ overlays x_3

	@ LOAD reg, off - read the frame word at byte offset off (relative to sp) into reg.
	.macro LOAD reg, off
				ldr     \reg, [sp, #\off]
	.endm

	@ STORE reg, off - write reg to the frame word at byte offset off (relative to sp).
	.macro STORE reg, off
				str     \reg, [sp, #\off]
	.endm

	@ Write n1, n2, n3, n4 - pack four 32-bit sums into four 16-bit results and
	@ store them through p_y, which advances by eight bytes.
	@ Every stored halfword is bits 12..27 of its sum, i.e. the sum shifted right
	@ by 12 with the upper bits dropped; no saturation is applied.
	@ The first and third arguments are shifted in place; w_1 and w_2 are overwritten.
	.macro Write n1, n2, n3, n4
				MOV     \n1, \n1, ASR #12
				MOV     \n3, \n3, ASR #12
				PKHBT   w_1, \n1, \n2, LSL #4
				PKHBT   w_2, \n3, \n4, LSL #4
				STMIA   p_y!, {w_1, w_2}
	.endm

	@ Copy10 - move ten halfwords from a source that sits two bytes off a word
	@ boundary to a destination written with whole-word stores.
	@ On entry:  r14 low half = the halfword just before the word r0 points at,
	@            r0 = source (read with a five-word load), r1 = destination.
	@ On exit:   r0 has advanced by 20 bytes and r1 by 20 bytes;
	@            the upper half of r7 is the halfword the next invocation needs
	@            in r14 (the caller moves it there).
	@ Overwrites r3-r12.
	.macro Copy10
				LDMIA   r0!, {r3-r7}
				PKHBT   r8, r14, r3, LSL #16
				PKHTB   r9, r3, r4
				MOV     r9, r9, ROR #16
				PKHTB   r10, r4, r5
				MOV     r10, r10, ROR #16
				PKHTB   r11, r5, r6
				MOV     r11, r11, ROR #16
				PKHTB   r12, r6, r7
				MOV     r12, r12, ROR #16
				STMIA   r1!, {r8-r12}
	.endm

@ COPY_X - reached from Convolve_asm when the low two bits of r0 are not both zero.
@ Copies forty halfwords of x into the frame at X_A using four Copy10 steps,
@ then points r0 at the copy and rejoins the main routine at COPY_END.
@ The halfword load below moves r0 on by two bytes before the word loads start;
@ this presumably assumes x is at least halfword aligned - NOTE(review): confirm.
@ Overwrites r1, r3-r12 and r14.
COPY_X:
				ADD     r1, sp, #X_A
				LDRH    r14, [r0], #2
				Copy10
	.rept 3
				ASR     r14, r7, #16
				Copy10
	.endr
				ADD     r0, sp, #X_A
				B       COPY_END

/*
 * Convolve_asm
 *
 * In:    r0 = x (input vector), r1 = h (impulse response), r2 = y (output vector)
 * Out:   forty 16-bit results stored through y
 * Saved: r4-r12 and lr are pushed on entry and restored on exit
 * Stack: 40 bytes of saved registers plus LOC_SIZE bytes of locals
 *
 * The length is fixed at 40 samples.  Each result is
 *     y[n] = (h[0]*x[n] + h[1]*x[n-1] + ... + h[n]*x[0]) >> 12
 * accumulated in 32 bits and stored as its low 16 bits (see the Write macro).
 * The running sums live in the LAST area of the frame, highest outputs first:
 * words 0-3 hold y[36..39], words 4-7 hold y[32..35], words 8-11 hold
 * y[28..31], and so on.
 *
 * Structure:
 *   - LINE_BEGIN runs five times; pass k loads the eight taps h[8k..8k+7].
 *   - Inside a pass, x is consumed from the top down, eight samples at a time.
 *     BLOCK_BEGIN runs 4, 3, 2, 1 and 0 times in passes 0 to 4.
 *   - When a pass ends, the eight lowest outputs not yet written are complete
 *     and are stored to y.
 *
 * The index comments on the multiply lines give the h and x element numbers
 * for pass 0 (inside BLOCK_BEGIN: for its first iteration), assuming the
 * lower halfword of each word is the lower-numbered element.  Later passes
 * use h indices 8 higher, and x indices 8 lower, per pass and per iteration.
 *
 * x is copied into the frame by COPY_X when it is not word aligned.  h is
 * read with LDM and y is written with STM, so both are presumably expected
 * to be word aligned already - NOTE(review): confirm against the callers.
 */
Convolve_asm:
				STMFD   sp!, {r4-r12, lr}
				SUB     sp, sp, #LOC_SIZE           @ allocate the local frame

@ Save the pointer arguments; r1 and r2 are reused as accumulators below.
				STORE   r1, P_H

				STORE   r2, P_Y

@ Packed counter 0x00040004: the low half counts the blocks left in the
@ current pass, the high half is the value the low half is reloaded from.
				MOV     num_cnt, #4
				ADD     num_cnt, num_cnt, LSL #16
				STORE   num_cnt, NUM_CNT


@ When x is not 4-byte aligned, COPY_X copies it into the frame and comes
@ back to COPY_END with r0 pointing at the copy.
				TST     r0, #3
				BNE     COPY_X

COPY_END:
				ADD     p_x, r0, #76                @ byte offset 76 = the word holding x[38] and x[39]
				STORE   p_x, P_X
				STORE   p_x, P_X_START

@ Clear the forty running sums in LAST (four stores of ten zero words).
				ADD     r14, sp, #LAST

				MOV     r3, #0
				MOV     r4, #0
				MOV     r5, #0
				MOV     r6, #0
				MOV     r7, #0
				MOV     r8, #0
				MOV     r9, #0
				MOV     r10, #0
				MOV     r11, #0
				MOV     r12, #0

				STM     r14!, {r3-r12}
				STM     r14!, {r3-r12}
				STM     r14!, {r3-r12}
				STM     r14,  {r3-r12}              @ 40 words in total


@ ---------------------------------------------------------------------------
@ Outer loop: one pass per group of eight taps.
@ ---------------------------------------------------------------------------

LINE_BEGIN:
/*
x sample numbers touched by the opening step of pass 0
(sketch kept from the original source):
32	33	34	35	36	37	38	39
33	34	35	36	37	38	39
34	35	36	37	38	39
35	36	37	38	39
36	37	38	39
37	38	39
38	39
39
*/
				LOAD    p_h, P_H                    @ p_h shares r0 with s_1, so the taps are fetched before the sums
				LOAD    p_x, P_X_START

				ADD     p_last, sp, #LAST

				LDM     p_h!, {h_1, h_2, h_3, h_4}  @ eight taps for this pass
				STORE   p_h, P_H

				LDMFA   p_x!, {x_1, x_2, x_3, x_4}  @ the four words ending at p_x; p_x steps down by 16 bytes
				LDM     p_last!, {s_1, s_2, s_3, s_4}   @ LAST words 0-3

				STORE   p_x, P_X
				STORE   p_x, P_X_START              @ the next pass starts eight samples lower


/*
Products added to LAST words 0-3 in this step
(sketch kept from the original source):
								36	37	38	39
						36	37	38	39
				36	37	38	39
		36	37	38	39
36	37	38	39
37	38	39
38	39
39
*/
				SMLABB  s_1, h_1, x_3, s_1          @ h[0], x[36]
				SMLABT  s_2, h_1, x_3, s_2          @ h[0], x[37]
				SMLABB  s_3, h_1, x_4, s_3          @ h[0], x[38]
				SMLABT  s_4, h_1, x_4, s_4          @ h[0], x[39]

				SMLATT  s_1, h_1, x_2, s_1          @ h[1], x[35]
				SMLATB  s_2, h_1, x_3, s_2          @ h[1], x[36]
				SMLATT  s_3, h_1, x_3, s_3          @ h[1], x[37]
				SMLATB  s_4, h_1, x_4, s_4          @ h[1], x[38]

				SMLABB  s_1, h_2, x_2, s_1          @ h[2], x[34]
				SMLABT  s_2, h_2, x_2, s_2          @ h[2], x[35]
				SMLABB  s_3, h_2, x_3, s_3          @ h[2], x[36]
				SMLABT  s_4, h_2, x_3, s_4          @ h[2], x[37]

				SMLATT  s_1, h_2, x_1, s_1          @ h[3], x[33]
				SMLATB  s_2, h_2, x_2, s_2          @ h[3], x[34]
				SMLATT  s_3, h_2, x_2, s_3          @ h[3], x[35]
				SMLATB  s_4, h_2, x_3, s_4          @ h[3], x[36]

				SMLABB  s_1, h_3, x_1, s_1          @ h[4], x[32]
				SMLABT  s_2, h_3, x_1, s_2          @ h[4], x[33]
				SMLABB  s_3, h_3, x_2, s_3          @ h[4], x[34]
				SMLABT  s_4, h_3, x_2, s_4          @ h[4], x[35]

				SMLATB  s_2, h_3, x_1, s_2          @ h[5], x[32]
				SMLATT  s_3, h_3, x_1, s_3          @ h[5], x[33]
				SMLATB  s_4, h_3, x_2, s_4          @ h[5], x[34]

				SMLABB  s_3, h_4, x_1, s_3          @ h[6], x[32]
				SMLABT  s_4, h_4, x_1, s_4          @ h[6], x[33]

				SMLATB  s_4, h_4, x_1, s_4          @ h[7], x[32]

				STM     sp, {s_1, s_2, s_3}         @ park three sums at S_A; they still need products from the next lower x words

				STR     s_4, [p_last, #-4]          @ the fourth sum has all eight taps of this pass; put it back in LAST word 3


/*
Products added to LAST words 4-7 in this step
(sketch kept from the original source):
32	33	34	35
33	34	35
34	35
35
*/
				LDM     p_last, {s_1, s_2, s_3, s_4}    @ LAST words 4-7

				SMLABB  s_1, h_1, x_1, s_1          @ h[0], x[32]
				SMLABT  s_2, h_1, x_1, s_2          @ h[0], x[33]
				SMLABB  s_3, h_1, x_2, s_3          @ h[0], x[34]
				SMLABT  s_4, h_1, x_2, s_4          @ h[0], x[35]

				SMLATB  s_2, h_1, x_1, s_2          @ h[1], x[32]
				SMLATT  s_3, h_1, x_1, s_3          @ h[1], x[33]
				SMLATB  s_4, h_1, x_2, s_4          @ h[1], x[34]

				LOAD    num_cnt, NUM_CNT            @ counter fetched early, ahead of the compare below

				SMLABB  s_3, h_2, x_1, s_3          @ h[2], x[32]
				SMLABT  s_4, h_2, x_1, s_4          @ h[2], x[33]

				SMLATB  s_4, h_2, x_1, s_4          @ h[3], x[32]


				CMP     num_cnt, #0                 @ the counter is zero only in the last pass
				BEQ     END


@ ---------------------------------------------------------------------------
@ Inner loop: one iteration per further group of eight x samples.
@ ---------------------------------------------------------------------------
BLOCK_BEGIN:
				LOAD    p_x, P_X
				LDMFA   p_x!, {x_1, x_2, x_3, x_4}  @ next lower eight samples
				STORE   p_x, P_X


/*
Finish the four sums still held in s_1-s_4 from the previous step, using
the newly loaded samples (sketch kept from the original source):
									32
							32	33
					32	33	34
			32	33	34	35
		32	33	34	35
	32	33	34	35
32	33	34	35

*/

				SMLATT  s_1, h_1, x_4, s_1          @ h[1], x[31]

				SMLABT  s_2, h_2, x_4, s_2          @ h[2], x[31]
				SMLABB  s_1, h_2, x_4, s_1          @ h[2], x[30]

				SMLATT  s_3, h_2, x_4, s_3          @ h[3], x[31]
				SMLATB  s_2, h_2, x_4, s_2          @ h[3], x[30]
				SMLATT  s_1, h_2, x_3, s_1          @ h[3], x[29]

				SMLABT  s_4, h_3, x_4, s_4          @ h[4], x[31]
				SMLABB  s_3, h_3, x_4, s_3          @ h[4], x[30]
				SMLABT  s_2, h_3, x_3, s_2          @ h[4], x[29]
				SMLABB  s_1, h_3, x_3, s_1          @ h[4], x[28]

				SMLATB  s_4, h_3, x_4, s_4          @ h[5], x[30]
				SMLATT  s_3, h_3, x_3, s_3          @ h[5], x[29]
				SMLATB  s_2, h_3, x_3, s_2          @ h[5], x[28]
				SMLATT  s_1, h_3, x_2, s_1          @ h[5], x[27]

				SMLABT  s_4, h_4, x_3, s_4          @ h[6], x[29]
				SMLABB  s_3, h_4, x_3, s_3          @ h[6], x[28]
				SMLABT  s_2, h_4, x_2, s_2          @ h[6], x[27]
				SMLABB  s_1, h_4, x_2, s_1          @ h[6], x[26]

				SMLATB  s_4, h_4, x_3, s_4          @ h[7], x[28]
				SMLATT  s_3, h_4, x_2, s_3          @ h[7], x[27]
				SMLATB  s_2, h_4, x_2, s_2          @ h[7], x[26]
				SMLATT  s_1, h_4, x_1, s_1          @ h[7], x[25]

				STM     p_last, {s_1, s_2, s_3, s_4}    @ back to LAST (words 4-7 on the first iteration)

/*
Finish the three sums parked at S_A
(sketch kept from the original source):
				36
		36	37
36	37	38

*/

				LDM     sp, {s_1, s_2, s_3}

				SMLATT  s_1, h_3, x_4, s_1          @ h[5], x[31]

				SMLABT  s_2, h_4, x_4, s_2          @ h[6], x[31]
				SMLABB  s_1, h_4, x_4, s_1          @ h[6], x[30]

				SMLATT  s_3, h_4, x_4, s_3          @ h[7], x[31]
				SMLATB  s_2, h_4, x_4, s_2          @ h[7], x[30]
				SMLATT  s_1, h_4, x_3, s_1          @ h[7], x[29]

				SUB     p_last, p_last, #16         @ back to the first word of the current group of eight sums
				STM     p_last, {s_1, s_2, s_3}     @ first three words of that group (words 0-2 on the first iteration)
				ADD     p_last, p_last, #32         @ on to the next group of eight sums (word 8 on the first iteration)



/*
Same opening step as at LINE_BEGIN, applied to the samples just loaded
(sketch kept from the original source):
				28	29	30	31
			28	29	30	31
		28	29	30	31
	28	29	30	31
28	29	30	31
29	30	31
30	31
31
*/
				LDM     p_last!, {s_1, s_2, s_3, s_4}   @ first four sums of the group (words 8-11 on the first iteration)

				SMLABB  s_1, h_1, x_3, s_1          @ h[0], x[28]
				SMLABT  s_2, h_1, x_3, s_2          @ h[0], x[29]
				SMLABB  s_3, h_1, x_4, s_3          @ h[0], x[30]
				SMLABT  s_4, h_1, x_4, s_4          @ h[0], x[31]

				SMLATT  s_1, h_1, x_2, s_1          @ h[1], x[27]
				SMLATB  s_2, h_1, x_3, s_2          @ h[1], x[28]
				SMLATT  s_3, h_1, x_3, s_3          @ h[1], x[29]
				SMLATB  s_4, h_1, x_4, s_4          @ h[1], x[30]

				SMLABB  s_1, h_2, x_2, s_1          @ h[2], x[26]
				SMLABT  s_2, h_2, x_2, s_2          @ h[2], x[27]
				SMLABB  s_3, h_2, x_3, s_3          @ h[2], x[28]
				SMLABT  s_4, h_2, x_3, s_4          @ h[2], x[29]

				SMLATT  s_1, h_2, x_1, s_1          @ h[3], x[25]
				SMLATB  s_2, h_2, x_2, s_2          @ h[3], x[26]
				SMLATT  s_3, h_2, x_2, s_3          @ h[3], x[27]
				SMLATB  s_4, h_2, x_3, s_4          @ h[3], x[28]

				SMLABB  s_1, h_3, x_1, s_1          @ h[4], x[24]
				SMLABT  s_2, h_3, x_1, s_2          @ h[4], x[25]
				SMLABB  s_3, h_3, x_2, s_3          @ h[4], x[26]
				SMLABT  s_4, h_3, x_2, s_4          @ h[4], x[27]

				SMLATB  s_2, h_3, x_1, s_2          @ h[5], x[24]
				SMLATT  s_3, h_3, x_1, s_3          @ h[5], x[25]
				SMLATB  s_4, h_3, x_2, s_4          @ h[5], x[26]

				SMLABB  s_3, h_4, x_1, s_3          @ h[6], x[24]
				SMLABT  s_4, h_4, x_1, s_4          @ h[6], x[25]

				SMLATB  s_4, h_4, x_1, s_4          @ h[7], x[24]

				STM     sp, {s_1, s_2, s_3}         @ park three sums at S_A for the next iteration
				STR     s_4, [p_last, #-4]          @ fourth sum back to LAST (word 11 on the first iteration)

/*
Second half of the group
(sketch kept from the original source):
24	25	26	27
25	26	27
26	27
27


*/
				LDM     p_last, {s_1, s_2, s_3, s_4}    @ last four sums of the group (words 12-15 on the first iteration)

				SMLABB  s_1, h_1, x_1, s_1          @ h[0], x[24]
				SMLABT  s_2, h_1, x_1, s_2          @ h[0], x[25]
				SMLABB  s_3, h_1, x_2, s_3          @ h[0], x[26]
				SMLABT  s_4, h_1, x_2, s_4          @ h[0], x[27]

				SMLATB  s_2, h_1, x_1, s_2          @ h[1], x[24]
				SMLATT  s_3, h_1, x_1, s_3          @ h[1], x[25]
				SMLATB  s_4, h_1, x_2, s_4          @ h[1], x[26]

				LOAD    num_cnt, NUM_CNT            @ counter fetched early, ahead of the decrement below

				SMLABB  s_3, h_2, x_1, s_3          @ h[2], x[24]
				SMLABT  s_4, h_2, x_1, s_4          @ h[2], x[25]

				SMLATB  s_4, h_2, x_1, s_4          @ h[3], x[24]


@ End of one block: count it off and loop while the low byte is non-zero.

				SUB     num_cnt, #1
				STORE   num_cnt, NUM_CNT
				TST     num_cnt, #0xff
				BNE     BLOCK_BEGIN


@ End of a pass: the sums in s_1-s_4, then the three parked at S_A plus the
@ one just stored below p_last, are complete.  Store them as eight results.
				LOAD    p_y, P_Y
				Write   s_1, s_2, s_3, s_4

				LDM     sp, {s_1, s_2, s_3}
				LDR     s_4, [p_last, #-4]

				Write   s_1, s_2, s_3, s_4

				STORE   p_y, P_Y

@ Set up the counter for the next pass: lower the reload value in the high
@ half by one, then copy it into the low half, which is zero at this point.

				SUB     num_cnt, #0x10000
				ADD     num_cnt, num_cnt, LSR #16
				STORE   num_cnt, NUM_CNT
				B       LINE_BEGIN

@ Last pass: no blocks follow the opening step, so store the final eight
@ results and return.
END:
				LOAD    p_y, P_Y
				Write   s_1, s_2, s_3, s_4

				LDM     sp, {s_1, s_2, s_3}
				LDR     s_4, [p_last, #-4]
				Write   s_1, s_2, s_3, s_4

				ADD     sp, sp, #LOC_SIZE           @ release the local frame
				LDMFD   sp!, {r4-r12, pc}
	.size   Convolve_asm, . - Convolve_asm
#endif
