blob: fedf779c01f9314081023b52d216c690939dba03 [file] [log] [blame]
/*******************************************************************************
* Copyright (C) 2016, ZTE Corporation.
*
* File name:    head.S
* File ID:      head.S
* Summary:      kernel image decompression startup code
*
* Date         Version  Tag      Author      Description
* ------------------------------------------------------------------------------
* 2016/09/12   V1.0     Create   Deng Ning   Created
*
*******************************************************************************/
/*******************************************************************************
* Header files                                                                 *
*******************************************************************************/
/*******************************************************************************
* Macro definitions                                                            *
*******************************************************************************/
#ifdef __thumb2__
/* Thumb-2 build: drop ARM()-wrapped lines, keep THUMB() lines, and force
 * 32-bit (.w) encodings where a narrow encoding would change behavior. */
#define ARM(x...)
#define THUMB(x...) x
#define W(instr) instr.w
#else
/* ARM-state build: keep ARM() lines, drop THUMB() lines. */
#define ARM(x...) x
#define THUMB(x...)
#define W(instr) instr
#endif
/* BE8 (big-endian) support is not built in this configuration. */
#define ARM_BE8(x...)
#define END(name) \
.size name, .-name
#define ENDPROC(name) \
.type name, %function; \
END(name)
/* Enable the MMU setup/teardown paths in the cache routines below. */
#define CYGOPT_HAL_ARM_MMU
/* Processor ID is read from CP15 c0 (see call_cache_fn). */
#define CONFIG_CPU_CP15
#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
/* Section descriptor cache bits: C=1, B=0 (write-through). */
#define CB_BITS 0x08
#else
/* Section descriptor cache bits: C=1, B=1 (write-back, bufferable). */
#define CB_BITS 0x0c
#endif
/*******************************************************************************
* External function declarations                                               *
*******************************************************************************/
.extern decompress_kernel
/*******************************************************************************
* External variable declarations                                               *
*******************************************************************************/
.extern image_start
/*******************************************************************************
* Global function implementation                                               *
*
* _start - entry point of the decompressor image.
* May be entered at an address different from its link address, with MMU and
* caches in an unknown state.  It:
*   1. forces SVC mode with IRQ/FIQ masked and async aborts unmasked,
*   2. disables MMU and caches via SCTLR,
*   3. computes the load-vs-link delta and fixes up the GOT and sp,
*   4. clears .bss, turns the caches on, decompresses the kernel,
*   5. flushes and turns the caches off, then jumps to the decompressed kernel.
*******************************************************************************/
.section ".start", #alloc, #execinstr
.align
.arm @ Always enter in ARM state
.global _start
.type _start, function
_start:
.rept 8
mov r0, r0 @ 8 NOPs: landing pad at the image start
.endr
.text
/* Move to SVC mode, IRQs/FIQs masked (mode bits 10011, I=F=1) */
mrs r0, cpsr
bic r0, #0x1f @ clear mode field
orr r0, r0, #0xd3 @ SVC mode, I=1, F=1
bic r0, #(1<<8) /* unmask Asynchronous abort */
msr cpsr_cxsf, r0
/* Control Register (SCTLR) setup: run uncached/unmapped until cache_on */
mrc p15, 0, r0, c1, c0, 0
bic r0, r0, #(1<<0) /* MMU disabled */
orr r0, r0, #(1<<1) /* Alignment fault checking enabled */
bic r0, r0, #(1<<2) /* Data Cache disabled */
orr r0, r0, #(1<<11) /* Branch prediction enabled */
bic r0, r0, #(1<<12) /* Instruction Cache disabled */
bic r0, r0, #(1<<13) /* USE VBAR to set the vector base address */
DSB /* Ensure all previous loads/stores have completed */
mcr p15, 0, r0, c1, c0, 0
ISB
/* Load the link-time addresses collected in LC0 (see table below) */
adr r0, LC0 @ r0 = runtime address of LC0
ldmia r0, {r1, r2, r3, r6, r10, r11, r12}
ldr sp, [r0, #28] @ word 7 of LC0 = .L_user_stack_end
/*
 * We might be running at a different address. We need
 * to fix up various pointers.
 */
sub r0, r0, r1 @ calculate the delta offset (runtime - link)
add r2, r2, r0 @ __bss_start
add r3, r3, r0 @ __bss_end
add r6, r6, r0 @ _edata
add r10, r10, r0 @ inflated kernel size location
add r11, r11, r0 @ got_start
add r12, r12, r0 @ got_end
add sp, sp, r0 @ sp
/* r4 = boot address of the decompressed kernel (read from image_start) */
ldr r4, =image_start
add r4, r4, r0 @ relocate the pointer itself
ldr r4, [r4] @ then load the target address
/*
 * The kernel build system appends the size of the
 * decompressed kernel at the end of the compressed data
 * in little-endian form.
 */
ldrb r9, [r10, #0] @ assemble the 32-bit size byte-by-byte
ldrb lr, [r10, #1]
orr r9, r9, lr, lsl #8
ldrb lr, [r10, #2]
ldrb r10, [r10, #3]
orr r9, r9, lr, lsl #16
orr r9, r9, r10, lsl #24
/*
 * Overlap check: if the decompressed image (r4 .. r4+size) would extend
 * past our own code, decompression would overwrite us.  There is no
 * self-relocation here, so hang deliberately instead of corrupting memory.
 */
add r10, r4, r9 @ r10 = end of decompressed kernel
adr r9, _clear_bss @ r9 = runtime address of our code
cmp r10, r9
dead_loop:
bgt dead_loop @ overlap: spin forever (flags never change)
/*
 * Relocate all entries in the GOT table.
 * Bump bss entries to _edata + dtb size
 */
mov r5, #0 @ r5 = appended DTB size (none here)
1:
ldr r1, [r11, #0] @ relocate entries in the GOT
add r1, r1, r0 @ This fixes up C references
cmp r1, r2 @ if entry >= bss_start &&
cmphs r3, r1 @ bss_end > entry
addhi r1, r1, r5 @ entry += dtb size
str r1, [r11], #4 @ next entry
cmp r11, r12
blo 1b
/* bump our bss pointers too */
add r2, r2, r5
add r3, r3, r5
/*
 * Clear the BSS section (word stores from __bss_start to __bss_end)
 */
mov r0, #0
_clear_bss:
str r0, [r2], #4
cmp r3, r2
bhi _clear_bss
bl cache_on @ enable MMU + caches for fast decompression
/*
 * decompress kernel: decompress_kernel(output, heap_start, heap_end)
 */
mov r0, r4 @ output address
mov r1, sp @ malloc space above stack
add r2, sp, #0x10000 @ 64k max
bl decompress_kernel
bl cache_clean_flush @ push decompressed image to memory
bl cache_off @ enter the kernel uncached/unmapped
/*
 * enter kernel
 */
mov r0, #0
bx r4
.align 2
.type LC0, #object
/* Link-time address table, loaded by the ldmia above (order matters). */
LC0:
.word LC0 @ r1
.word __bss_start @ r2
.word __bss_end @ r3
.word _edata @ r6
.word input_data_end - 4 @ r10 (inflated size location)
.word _got_start @ r11
.word _got_end @ ip
.word .L_user_stack_end @ sp
.size LC0, . - LC0
/*
 * __setup_mmu - build a flat (1:1) section-mapped page table just below the
 * kernel load address.
 * In:   r4 = kernel load address
 *       r6 = section cache attribute bits from the caller (CB_BITS | 0x02)
 * Out:  r3 = 16 KiB-aligned page directory base
 *            (r4 rounded down to 1 MiB, minus 16 KiB, aligned to 16 KiB)
 * Clobbers: r0-r2, r9, r10.  Returns via lr.
 */
__setup_mmu:
lsr r3, r4, #0x14 @ round load address down to 1 MiB
lsl r3, r3, #0x14
sub r3, r3, #16384 @ Page directory size
bic r3, r3, #0xff @ Align the pointer to 16 KiB
bic r3, r3, #0x3f00
/*
 * Initialise the page tables, turning on the cacheable and bufferable
 * bits for the RAM area only.
 */
mov r0, r3 @ r0 walks the 16 KiB page directory
mov r9, r0, lsr #18
mov r9, r9, lsl #18 @ start of RAM (256 KiB-aligned)
add r10, r9, #0x10000000 @ a reasonable RAM size (256 MiB)
mov r1, #0x12 @ XN|U + section mapping
orr r1, r1, #3 << 10 @ AP=11 (full access)
add r2, r3, #16384 @ r2 = end of page directory
1: cmp r1, r9 @ if virt > start of RAM
cmphs r10, r1 @ && end of RAM > virt
bic r1, r1, #0x1c @ clear XN|U + C + B
orrlo r1, r1, #0x10 @ Set XN|U for non-RAM
orrhs r1, r1, r6 @ set RAM section settings
str r1, [r0], #4 @ 1:1 mapping
add r1, r1, #1048576 @ next 1 MiB section
teq r0, r2
bne 1b
/*
 * If ever we are running from Flash, then we surely want the cache
 * to be enabled also for our execution instance... We map 2MB of it
 * so there is no map overlap problem for up to 1 MB compressed kernel.
 * If the execution is in RAM then we would only be duplicating the above.
 */
orr r1, r6, #0x04 @ ensure B is set for this
orr r1, r1, #3 << 10 @ AP=11
mov r2, pc @ section containing the current code
mov r2, r2, lsr #20
orr r1, r1, r2, lsl #20
add r0, r3, r2, lsl #2 @ directory slot for that section
str r1, [r0], #4
add r1, r1, #1048576 @ and the following 1 MiB
str r1, [r0]
mov pc, lr
ENDPROC(__setup_mmu)
/*
 * __armv7_mmu_cache_on - enable I/D caches, branch prediction and (on VMSA
 * cores) the MMU with the 1:1 map built by __setup_mmu.
 * In:   r4 = kernel load address (consumed by __setup_mmu)
 * Out:  caches/MMU enabled; r0 clobbered; returns via lr saved in r12.
 * Note: every NE-conditional below is governed by "tst r11, #0xf"
 *       (ID_MMFR0 VMSA field != 0); no intervening instruction writes flags.
 *       The unconditional bics on r6 are harmless on non-VMSA cores since
 *       the matching mcrne that consumes r6 is then skipped.
 */
__armv7_mmu_cache_on:
mov r12, lr @ preserve return address across blne
#ifdef CYGOPT_HAL_ARM_MMU
mrc p15, 0, r11, c0, c1, 4 @ read ID_MMFR0
tst r11, #0xf @ VMSA
movne r6, #CB_BITS | 0x02 @ !XN
blne __setup_mmu @ returns page dir base in r3
mov r0, #0
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
tst r11, #0xf @ VMSA
mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
#endif
mrc p15, 0, r0, c1, c0, 0 @ read control reg
bic r0, r0, #1 << 28 @ clear SCTLR.TRE
orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement
orr r0, r0, #0x003c @ write buffer
bic r0, r0, #2 @ A (no unaligned access fault)
orr r0, r0, #1 << 22 @ U (v6 unaligned access model)
@ (needed for ARM1176)
#ifdef CYGOPT_HAL_ARM_MMU
ARM_BE8( orr r0, r0, #1 << 25 ) @ big-endian page tables
mrcne p15, 0, r6, c2, c0, 2 @ read ttb control reg
orrne r0, r0, #1 @ MMU enabled
movne r1, #0xfffffffd @ domain 0 = client
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #3 << 0 @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
mcr p15, 0, r0, c7, c5, 4 @ ISB
mcr p15, 0, r0, c1, c0, 0 @ load control register
mrc p15, 0, r0, c1, c0, 0 @ and read it back
mov r0, #0
mcr p15, 0, r0, c7, c5, 4 @ ISB
mov pc, r12
/* Size of one proc_types entry: ID word + mask word + 3 branch slots. */
#define PROC_ENTRY_SIZE (4*5)
/*
 * cache_on / call_cache_fn - dispatch to the per-CPU cache routine.
 * r3 = byte offset of the wanted slot within a proc_types entry:
 *      8 = 'cache on', 12 = 'cache off', 16 = 'cache flush'.
 * Scans proc_types for ((cpu_id ^ match) & mask) == 0 and branches into
 * the matching entry's slot.  Clobbers r1, r2, r9, r12.
 */
.align 5
cache_on: mov r3, #8 @ cache_on function
b call_cache_fn
call_cache_fn: adr r12, proc_types
#ifdef CONFIG_CPU_CP15
mrc p15, 0, r9, c0, c0 @ get processor ID
#elif defined(CONFIG_CPU_V7M)
/*
 * On v7-M the processor id is located in the V7M_SCB_CPUID
 * register, but as cache handling is IMPLEMENTATION DEFINED on
 * v7-M (if existant at all) we just return early here.
 * If V7M_SCB_CPUID were used the cpu ID functions (i.e.
 * __armv7_mmu_cache_{on,off,flush}) would be selected which
 * use cp15 registers that are not implemented on v7-M.
 */
bx lr
#else
ldr r9, =CONFIG_PROCESSOR_ID
#endif
1: ldr r1, [r12, #0] @ get value
ldr r2, [r12, #4] @ get mask
eor r1, r1, r9 @ (real ^ match)
tst r1, r2 @ & mask
ARM( addeq pc, r12, r3 ) @ call cache function
THUMB( addeq r12, r3 )
THUMB( moveq pc, r12 ) @ call cache function
add r12, r12, #PROC_ENTRY_SIZE @ no match: try next entry
b 1b
/*
 * Table for cache operations, matched by call_cache_fn. This is basically:
 * - CPU ID match
 * - CPU ID mask
 * - 'cache on' method instruction
 * - 'cache off' method instruction
 * - 'cache flush' method instruction
 *
 * We match an entry using: ((real_id ^ match) & mask) == 0
 *
 * Writethrough caches generally only need 'on' and 'off'
 * methods. Writeback caches _must_ have the flush method
 * defined.
 */
.align 2
.type proc_types,#object
proc_types:
.word 0x000f0000 @ new CPU Id
.word 0x000f0000
W(b) __armv7_mmu_cache_on
W(b) __armv7_mmu_cache_off
W(b) __armv7_mmu_cache_flush
.word 0 @ unrecognised type: terminator (mask 0 matches any ID)
.word 0
mov pc, lr @ no-op 'cache on'
THUMB( nop )
mov pc, lr @ no-op 'cache off'
THUMB( nop )
mov pc, lr @ no-op 'cache flush'
THUMB( nop )
.size proc_types, . - proc_types
/*
 * If you get a "non-constant expression in ".if" statement"
 * error from the assembler on this line, check that you have
 * not accidentally written a "b" instruction where you should
 * have written W(b).
 */
.if (. - proc_types) % PROC_ENTRY_SIZE != 0
.error "The size of one or more proc_types entries is wrong."
.endif
/* cache_off - dispatch the per-CPU 'cache off' slot (offset 12). */
.align 5
cache_off: mov r3, #12 @ cache_off function
b call_cache_fn
/*
 * __armv7_mmu_cache_off - disable MMU and D-cache, then clean+invalidate
 * everything (via __armv7_mmu_cache_flush), invalidate TLBs and the branch
 * predictor.  Clobbers r0; returns via lr saved in r12.
 */
__armv7_mmu_cache_off:
mrc p15, 0, r0, c1, c0
#ifdef CYGOPT_HAL_ARM_MMU
bic r0, r0, #0x000d @ clear M (MMU), C (D-cache), W (write buffer)
#else
bic r0, r0, #0x000c @ clear C and W only (no MMU)
#endif
mcr p15, 0, r0, c1, c0 @ turn MMU and cache off
mov r12, lr @ preserve lr across the flush call
bl __armv7_mmu_cache_flush
mov r0, #0
#ifdef CYGOPT_HAL_ARM_MMU
mcr p15, 0, r0, c8, c7, 0 @ invalidate whole TLB
#endif
mcr p15, 0, r0, c7, c5, 6 @ invalidate BTC
mcr p15, 0, r0, c7, c10, 4 @ DSB
mcr p15, 0, r0, c7, c5, 4 @ ISB
mov pc, r12
/* cache_clean_flush - dispatch the per-CPU 'cache flush' slot (offset 16). */
.align 5
cache_clean_flush:
mov r3, #16
b call_cache_fn
/*
 * __armv7_mmu_cache_flush - clean+invalidate the entire data cache, then
 * invalidate the I-cache and branch predictor.
 * On ARMv7 hierarchical-cache cores this walks every cache level up to LoC
 * (from CLIDR), cleaning+invalidating by set/way (DCCISW) using the
 * geometry read from CCSIDR.  Working registers (including r4, the kernel
 * address) are preserved on the stack around the walk.
 */
__armv7_mmu_cache_flush:
tst r4, #1 @ NOTE(review): bit 0 of r4 looks like a "D-cache was never enabled" flag - confirm against the cache_on path
bne _iflush @ if set, skip the D-cache clean
mrc p15, 0, r10, c0, c1, 5 @ read ID_MMFR1
tst r10, #0xf << 16 @ hierarchical cache (ARMv7)
mov r10, #0
beq hierarchical
mcr p15, 0, r10, c7, c14, 0 @ clean+invalidate D
b _iflush
hierarchical:
mcr p15, 0, r10, c7, c10, 5 @ DMB
stmfd sp!, {r0-r7, r9-r11} @ preserve working regs (incl. r4)
mrc p15, 1, r0, c0, c0, 1 @ read CLIDR
ands r3, r0, #0x7000000 @ extract LoC (bits 26:24)
mov r3, r3, lsr #23 @ r3 = LoC * 2 (level selector step is 2)
beq _finished @ LoC == 0: no cache to clean
mov r10, #0 @ r10 = (level << 1) = CSSELR value
_loop1:
add r2, r10, r10, lsr #1 @ r2 = 3 * level: CLIDR Ctype field shift
mov r1, r0, lsr r2 @ r1 = Ctype for this level
and r1, r1, #7
cmp r1, #2 @ >= 2 means D or unified cache present
blt _skip @ no D-cache at this level
mcr p15, 2, r10, c0, c0, 0 @ select level in CSSELR
mcr p15, 0, r10, c7, c5, 4 @ ISB so CSSELR takes effect
mrc p15, 1, r1, c0, c0, 0 @ read CCSIDR for this level
and r2, r1, #7 @ log2(line size) - 4
add r2, r2, #4 @ r2 = set-index shift
ldr r4, =0x3ff
ands r4, r4, r1, lsr #3 @ r4 = number of ways - 1
clz r5, r4 @ r5 = way-index shift (32 - way bits)
ldr r7, =0x7fff
ands r7, r7, r1, lsr #13 @ r7 = number of sets - 1
_loop2:
mov r9, r4 @ r9 = way counter (counts down)
_loop3:
ARM( orr r11, r10, r9, lsl r5 ) @ r11 = level | way
ARM( orr r11, r11, r7, lsl r2 ) @        | set
THUMB( lsl r6, r9, r5 )
THUMB( orr r11, r10, r6 ) @ r11 = level | way
THUMB( lsl r6, r7, r2 )
THUMB( orr r11, r11, r6 ) @        | set
mcr p15, 0, r11, c7, c14, 2 @ DCCISW: clean+invalidate by set/way
subs r9, r9, #1 @ next way
bge _loop3
subs r7, r7, #1 @ next set
bge _loop2
_skip:
add r10, r10, #2 @ next cache level
cmp r3, r10
bgt _loop1
_finished:
ldmfd sp!, {r0-r7, r9-r11} @ restore working regs
mov r10, #0 @ switch back to cache level 0
mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr
_iflush:
mcr p15, 0, r10, c7, c10, 4 @ DSB
mcr p15, 0, r10, c7, c5, 0 @ invalidate I+BTB
mcr p15, 0, r10, c7, c10, 4 @ DSB
mcr p15, 0, r10, c7, c5, 4 @ ISB
mov pc, lr
.align
/* 4 KiB decompressor stack (zero-filled NOBITS section).
 * .L_user_stack_end is published through LC0 and loaded into sp at _start;
 * the decompressor's 64 KiB malloc arena sits immediately above it. */
.section ".stack", "aw", %nobits
.L_user_stack: .space 4096
.L_user_stack_end: