//*void Convolve (
//*    Word16 x[],        /* (i)     : input vector                           */
//*    Word16 h[],        /* (i)     : impulse response                       */
//*    Word16 y[],        /* (o)     : output vector                          */
//*    Word16 L           /* (i)     : vector size                            */
//*)
//  r0 --- x[]
//  r1 --- h[]
//  r2 --- y[]
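//  r3 --- L   (not read by the assembly below: r3 is overwritten before use and both code paths always produce 64 outputs)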

#include "oscl_base_macros.h"
#include "gsm_amr_typedefs.h"

#if (PV_COMPILER == EPV_ARM_RVCT)

__asm void Convolve_opt_asm(
		Word16 x[],                           /* (i)     : input vector                              */
		Word16 h[],                           /* (i) Q15    : impulse response                       */
		Word16 y[],                           /* (o) 12 bits: output vector                          */
		Word16 L                              /* (i)     : vector size                               */
	     )
{
        PRESERVE8
            
#if PV_CPU_ARCH_VERSION >= 6
        STMFD     r13!, {r4 - r12, r14}
        MOV       r14, #1

//---------------		
// h3  h2  h1  h0
//             x0
//         x0  x1
//     x0  x1  x2
// x0  x1  x2  x3
//---------------
RollPre       
        ADD     r3, r1,   r14, LSL #3
        MOV     r5, r0                  // ʼxѭʼַ
        LDR     r4, [r3, #-4]!          // h[2], h[3]
        LDR     r6, [r5], #4            // x[0], x[1]
        LDR     r7, [r5], #4            // x[2], x[3]
        
        SMULTB  r12, r4, r6
        
        SMULBB  r11, r4, r6
        SMLABT  r12, r4, r6, r12
        
        LDR     r4,  [r3, #-4]!         // h[0], h[1]
        SMULTB  r10, r4, r6
        SMLATT  r11, r4, r6, r11
        SMLATB  r12, r4, r7, r12
        
        SMULBB  r9,  r4, r6
        SMLABT  r10, r4, r6, r10
        SMLABB  r11, r4, r7, r11
        SMLABT  r12, r4, r7, r12
        
        CMP     r3, r1
        BLE     Rolled               // XһԪأٹ
//---------------
// h3  h2  h1  h0
// x1  x2  x3  x4
// x2  x3  x4  x5
// x3  x4  x5  x6
// x4  x5  x6  x7
//---------------
        LDR     r4,  [r3, #-4]!          // h[2], h[3]
Rolling 
        LDR     r8,  [r5], #4
        
        SMLATT  r9,  r4, r6, r9
        SMLADX  r10, r4, r7, r10
        SMLATT  r11, r4, r7, r11
        SMLADX  r12, r4, r8, r12
        
        SMLABB  r9,  r4, r7, r9
        SMLABB  r11, r4, r8, r11
        
        MOV     r6, r7
        MOV     r7, r8	
        
        CMP     r3, r1
        LDRGT   r4,  [r3, #-4]!          
        BGT     Rolling 
Rolled 
        // Ѿ4еĹͣݺ
        LDR     r4, =0x8000
        ADD     r9,  r4, r9,  LSL #1
        ADD     r10, r4, r10, LSL #1
        ADD     r11, r4, r11, LSL #1
        ADD     r12, r4, r12, LSL #1
        MOV     r9,  r9,  ASR #16
        MOV     r10, r10, ASR #16
        MOV     r11, r11, ASR #16
        MOV     r12, r12, ASR #16
        STRH    r9,  [r2], #2
        STRH    r10, [r2], #2
        STRH    r11, [r2], #2
        STRH    r12, [r2], #2
        
        // ѭ
        CMP     r14, #16
        ADD     r14, r14, #1
        BLT     RollPre

Convolve_asm_end
        LDMFD   r13!, {r4 - r12, r15}
	
#else

        STMFD          r13!, {r4 - r12, r14}
        MOV            r3,  #0                           // n
        MOV            r11, #0x8000

LOOP
        ADD            r4, r1, r3, LSL #1                // tmpH address
        ADD            r5, r3, #1                        // i = n + 1
        MOV            r6, r0                            // tmpX = x
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        SUB            r5, r5, #1
        MUL            r8,  r9, r10

LOOP1
        CMP            r5, #0
        BLE            L1
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        MLA            r8, r12, r14, r8
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        SUBS           r5, r5, #4
        MLA            r8, r12, r14, r8
        
        B              LOOP1

L1

        ADD            r5, r11, r8, LSL #1
        MOV            r5, r5, LSR #16                   //extract_h(s)
        ADD            r3, r3, #1
        STRH           r5, [r2], #2                      //y[n]
        
        
        ADD            r4, r1, r3, LSL #1                //tmpH address
        ADD            r5, r3, #1
        MOV            r6, r0
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4], #-2
        
        MUL            r8, r9, r10
        SUB            r5, r5, #2
        MLA            r8, r12, r14, r8

LOOP2
        CMP            r5, #0
        BLE            L2
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        MLA            r8, r12, r14, r8
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        SUBS           r5, r5, #4
        MLA            r8, r12, r14, r8
        B              LOOP2

L2
        ADD            r8, r11, r8, LSL #1
        MOV            r8, r8, LSR #16                   //extract_h(s)
        ADD            r3, r3, #1
        STRH           r8, [r2], #2                      //y[n]
        
        ADD            r4, r1, r3, LSL #1
        ADD            r5, r3, #1
        MOV            r6, r0
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4], #-2
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4], #-2
        MUL            r8, r9, r10
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4], #-2
        MLA            r8, r12, r14, r8
        SUB            r5, r5, #3
        MLA            r8, r9, r10, r8

LOOP3
        CMP            r5, #0
        BLE            L3
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        MLA            r8, r12, r14, r8
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        SUBS           r5, r5, #4
        MLA            r8, r12, r14, r8
        B              LOOP3

L3
        ADD            r8, r11, r8, LSL #1
        MOV            r8, r8, LSR #16                   //extract_h(s)
        ADD            r3, r3, #1
        STRH           r8, [r2], #2                      //y[n]
        
        ADD            r5, r3, #1                        // i = n + 1
        ADD            r4, r1, r3, LSL #1                // tmpH address
        MOV            r6, r0
        MOV            r8, #0

LOOP4
        CMP            r5, #0
        BLE            L4
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        MLA            r8, r12, r14, r8
        LDRSH          r9,  [r6], #2                     // *tmpX++
        LDRSH          r10, [r4], #-2                    // *tmpH--
        LDRSH          r12, [r6], #2                     // *tmpX++
        LDRSH          r14, [r4], #-2                    // *tmpH--
        MLA            r8, r9, r10, r8
        SUBS           r5, r5, #4
        MLA            r8, r12, r14, r8
        B              LOOP4
L4
        ADD            r5, r11, r8, LSL #1
        MOV            r5, r5, LSR #16                   //extract_h(s)
        ADD            r3, r3, #1
        STRH           r5, [r2], #2                      //y[n]
        
        CMP            r3, #64
        BLT            LOOP

Convolve_asm_end
        LDMFD      r13!, {r4 - r12, r15}
#endif
}
#endif

