diff --git a/core/exception_unaligned_load.S.inc b/core/exception_unaligned_load.S.inc
deleted file mode 100644
index 03c9830..0000000
--- a/core/exception_unaligned_load.S.inc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Xtensa Exception unaligned load handler
-
-   Completes l8/l16 load instructions from Instruction address space,
-   that the architecture require to be 4 byte aligned word reads.
-
-   Called from either UserExceptionVector or DoubleExceptionVector
-   depending on where the exception happened.
-
-   Fast path (no branches) is for l8ui.
-
-   Part of esp-open-rtos
-   Copyright (C) Angus Gratton
-   BSD Licensed as described in the file LICENSE
-*/
-    .text
-    .section .vecbase.text, "x"
-    .literal_position
-
-/* "Fix" LoadStoreException exceptions that are l8/l16 from an Instruction region,
-   normal exception variant. */
-UserExceptionLoadStoreHandler:
-    addi sp, sp, -0x18
-    s32i a2, sp, 0x08
-    rsr.epc1 a2
-/* Inner UserLoadStoreExceptionHandler handlers. Works for both level1 & level 2 interrupt level.
- *
- * Called from level-specific handler above which sets up stack and loads epcX into a2.
- */
-InnerLoadStoreExceptionHandler:
-    s32i a3, sp, 0x0c
-    s32i a4, sp, 0x10
-    s32i a5, sp, 0x14
-    rsr.sar a0 // save sar in a0
-
-    /* Examine the instruction we failed to execute (in a2) */
-    ssa8l a2 // sar is now correct shift for aligned read
-    movi a3, ~3
-    and a2, a2, a3 // a2 now 4-byte aligned address of instruction
-    l32i a3, a2, 0
-    l32i a4, a2, 4
-    src a2, a4, a3 // a2 now instruction that failed
-
-    /* check for l8ui opcode 0x000002, or branch to check l16 */
-    movi a3, 0x00700F /* opcode mask for l8ui/l16si/l16ui */
-    and a3, a2, a3
-    bnei a3, 0x000002, .Lcheck_fix_16bit
-    movi a5, 0xFF
-
-.Lcan_fix:
-    /* verified an 8- or 16-bit read
-       a2 holds instruction, a5 holds mask to apply to read value
-    */
-    rsr.excvaddr a3 // read faulting address
-    ssa8l a3 /* sar is now shift to extract a3's byte */
-    movi a4, ~3
-    and a3, a3, a4 /* a3 now word aligned read address */
-
-    l32i a3, a3, 0 /* perform the actual read */
-    srl a3, a3 /* shift right correct distance */
-    and a4, a3, a5 /* mask off bits we need for an l8/l16 */
-
-    bbsi a5, 14, .Lmaybe_extend_sign
-.Lafter_extend_sign:
-    /* a2 holds instruction, a4 holds the correctly read value */
-    extui a2, a2, 4, 4 /* a2 now destination register 0-15 */
-
-    /* test if a4 needs to be written directly to a register (ie not a working register) */
-    bgei a2, 6, .Lwrite_value_direct_reg
-    /* test if a4 needs to be written to a0 */
-    beqz a2, .Lwrite_value_a0_reg
-
-    /* otherwise, a4 can be written to a saved working register 'slot' on the stack */
-    addx4 a5, a2, sp
-    s32i a4, a5, 0
-
-.Lafter_write_value:
-    /* test PS.INTLEVEL (1=User, 2=Double) to see which interrupt level we restore from
-    */
-    rsr.ps a2
-    bbsi a2, 1, .Lincrement_PC_intlevel2
-.Lincrement_PC_intlevel1:
-    rsr.epc1 a2
-    addi a3, a2, 0x3
-    wsr.epc1 a3
-    wsr.sar a0 // restore saved sar
-    rsr.excsave1 a0 // restore a0 saved in exception vector
-.Lafter_increment_PC:
-    // Restore registers
-    l32i a2, sp, 0x08
-    l32i a3, sp, 0x0c
-    l32i a4, sp, 0x10
-    l32i a5, sp, 0x14
-    addi sp, sp, 0x18
-    rfe
-
-
-/* Check the load instruction a2 for an l16si/16ui instruction
-
-   First test for a signed vs unsigned load.
-
-   a2 is the instruction, need to load a5 with the mask to use */
-.Lcheck_fix_16bit:
-    movi a4, 0x001002 /* l16si or l16ui opcode after masking */
-    bne a3, a4, .Lcant_fix
-
-    bbsi a2, 15, .Lcan_fix_16bit_signed
-    movi a5, 0xFFFF
-    j .Lcan_fix
-.Lcan_fix_16bit_signed:
-    movi a5, 0x7FFF
-    j .Lcan_fix
-
-/* not an opcode we can try to fix, so bomb out
-   TODO: the exception dump will have some wrong values in it */
-.Lcant_fix:
-    call0 sdk_user_fatal_exception_handler
-
-/* increment PC for a DoubleException */
-.Lincrement_PC_intlevel2:
-    rsr.epc2 a2
-    addi a3, a2, 0x3
-    wsr.epc2 a3
-    wsr.sar a0 // restore saved sar
-    rsr.excsave2 a0 // restore a0 saved in exception vector
-    j .Lafter_increment_PC
-
-.Lmaybe_extend_sign: /* apply 16-bit sign extension if necessary
-                        a3 holds raw value, a4 holds masked */
-    bbsi a5, 15, .Lafter_extend_sign /* 16-bit unsigned, no sign extension */
-    bbci a3, 15, .Lafter_extend_sign /* sign bit not set, no sign extension */
-    movi a3, 0xFFFF8000
-    or a4, a3, a4 /* set 32-bit sign bits */
-    j .Lafter_extend_sign
-
-.Lwrite_value_direct_reg:
-    /* Directly update register index a2, in range 6-15, using value in a4 */
-    addi a2, a2, -6
-    slli a2, a2, 3 /* offset from a6, x8 */
-    movi a3, .Ldirect_reg_jumptable
-    add a2, a2, a3
-    jx a2
-    .align 8
-.Ldirect_reg_jumptable:
-    mov a6, a4
-    j .Lafter_write_value
-    .align 8
-    mov a7, a4
-    j .Lafter_write_value
-    .align 8
-    mov a8, a4
-    j .Lafter_write_value
-    .align 8
-    mov a9, a4
-    j .Lafter_write_value
-    .align 8
-    mov a10, a4
-    j .Lafter_write_value
-    .align 8
-    mov a11, a4
-    j .Lafter_write_value
-    .align 8
-    mov a12, a4
-    j .Lafter_write_value
-    .align 8
-    mov a13, a4
-    j .Lafter_write_value
-    .align 8
-    mov a14, a4
-    j .Lafter_write_value
-    .align 8
-    mov a15, a4
-    j .Lafter_write_value
-
-.Lwrite_value_a0_reg:
-    /* a0 is saved in excsave1,so just update this with value
-       TODO: This won't work with interrupt level 2
-    */
-    wsr.excsave1 a4
-    j .Lafter_write_value
-
-    .literal_position
-/* "Fix" LoadStoreException exceptions that are l8/l16 from an Instruction region,
-   DoubleException exception variant (ie load happened in a level1 exception handler).
-*/
-DoubleExceptionLoadStoreHandler:
-    addi sp, sp, -0x18
-    s32i a2, sp, 0x08
-    rsr.epc2 a2
-    j InnerLoadStoreExceptionHandler
-
-/* End of InnerUserLoadStoreExceptionHandler */
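Both the deleted handler above and its replacement below rest on the same trick: reconstruct the 24-bit instruction that faulted from the two aligned words containing it, then redo the load as an aligned 32-bit read and extract the byte or halfword. A rough C model of that sequence, for reference only (these helper names are illustrative and appear in neither file):

    #include <stdint.h>

    /* C equivalent of the ssa8l/src pair: recover a (possibly
     * word-straddling) little-endian instruction at 'epc' from the two
     * aligned words around it. */
    uint32_t fetch_insn(uint32_t epc, uint32_t lo_word, uint32_t hi_word)
    {
        uint64_t pair = ((uint64_t)hi_word << 32) | lo_word;
        return (uint32_t)(pair >> (8 * (epc & 3)));
    }

    /* Complete an l8ui/l16ui/l16si load at 'excvaddr', given the aligned
     * word that contains the target byte/halfword (l32i of excvaddr & ~3).
     * Assumes the opcode has already been verified as one of the three. */
    uint32_t complete_load(uint32_t insn, uint32_t excvaddr, uint32_t word)
    {
        uint32_t v = word >> (8 * (excvaddr & 3)); /* ssa8l + srl */
        if ((insn & 0x00700F) == 0x000002)         /* l8ui */
            return v & 0xFF;
        v &= 0xFFFF;                               /* l16ui/l16si */
        if ((insn & 0x8000) && (v & 0x8000))       /* l16si, negative value */
            v |= 0xFFFF0000;
        return v;
    }
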
diff --git a/core/exception_vectors.S b/core/exception_vectors.S
index 270b711..a577034 100644
--- a/core/exception_vectors.S
+++ b/core/exception_vectors.S
@@ -26,7 +26,6 @@
     .text
     .section .vecbase.text, "x"
     .global VecBase
-    .type VecBase, @function /* it's not really a function, but treat it like one */
 
     .org 0
 VecBase:
     /* IMPORTANT: exception vector literals will go here, but we
@@ -36,35 +35,38 @@ VecBase:
     */
     .literal_position
 
     .org 0x10
+    .type DebugExceptionVector, @function
 DebugExceptionVector:
     wsr.excsave2 a0
     call0 sdk_user_fatal_exception_handler
     rfi 2
 
     .org 0x20
+    .type NMIExceptionVector, @function
 NMIExceptionVector:
     wsr.excsave3 a0
     call0 CallNMIExceptionHandler
     rfi 3 /* CallNMIExceptionHandler should call rfi itself */
 
     .org 0x30
+    .type KernelExceptionVector, @function
 KernelExceptionVector:
     break 1, 0
     call0 sdk_user_fatal_exception_handler
     rfe
 
     .org 0x50
+    .type UserExceptionVector, @function
 UserExceptionVector:
     wsr.excsave1 a0
     rsr.exccause a0
-    beqi a0, CAUSE_LOADSTORE, UserExceptionLoadStoreHandler
+    beqi a0, CAUSE_LOADSTORE, LoadStoreErrorHandler
     j UserExceptionHandler
 
     .org 0x70
+    .type DoubleExceptionVector, @function
 DoubleExceptionVector:
     break 1, 4
-    rsr.exccause a0
-    beqi a0, CAUSE_LOADSTORE, DoubleExceptionLoadStoreHandler
     call0 sdk_user_fatal_exception_handler
 
 /* Reset vector would go here at offset 0x80 but should be unused,
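The handler added in the next hunk keys off Xtensa's little-endian load encodings: under mask 0x00700F, l8ui matches 0x000002 and both 16-bit loads match 0x001002, with bit 15 then separating l16si from l16ui, and the destination register sitting in bits 4..7. A small sketch of those checks (function names are ours, not the file's):

    #include <stdbool.h>
    #include <stdint.h>

    #define LOAD_MASK 0x00700Fu
    #define L8UI_PAT  0x000002u /* l8ui */
    #define L16_PAT   0x001002u /* l16ui, or l16si when bit 15 is also set */

    bool is_emulated_load(uint32_t insn)
    {
        uint32_t masked = insn & LOAD_MASK;
        return masked == L8UI_PAT || masked == L16_PAT;
    }

    bool is_signed_l16(uint32_t insn) /* bbsi/bbci a2, 15 in the asm */
    {
        return (insn & LOAD_MASK) == L16_PAT && (insn & 0x8000) != 0;
    }

    unsigned dest_reg(uint32_t insn) /* t field, bits 4..7 */
    {
        return (insn >> 4) & 0xF;
    }
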
@@ -72,10 +74,260 @@ DoubleExceptionVector:
 
 /***** end of exception vectors *****/
 
-/* We include this here so UserExceptionLoadStoreHandler is within
-   the range of a 'beq' instruction jump.
+/* Xtensa Exception unaligned load handler
+
+   Completes l8/l16 load instructions from Instruction address space,
+   for which the architecture only supports 32-bit reads.
+
+   Called from UserExceptionVector if EXCCAUSE is LoadStoreErrorCause.
+
+   Fast path (no branches) is for l8ui.
 */
-#include "exception_unaligned_load.S.inc"
+    .literal_position
+
+    .type LoadStoreErrorHandler, @function
+LoadStoreErrorHandler:
+    # Note: we use a0 as our "stack pointer" here because it's already been
+    # saved in UserExceptionVector, and we never call out to other routines
+    # so we don't have to worry about it being clobbered. It would be
+    # preferable to use a1 instead, but this would require changes to other
+    # parts of the UserExceptionHandler code which we haven't gotten around
+    # to yet.
+    # TODO: Eventually, switch everything over to saving a1 instead of a0
+    # in UserExceptionVector so we can use the more mnemonic SP for this.
+
+    # Note: registers are saved at (regnum * 4) offsets so the calculation
+    # is easier later on. This means we don't use the first two entries
+    # (since we don't save a0 or a1 here), so we just adjust the pointer in
+    # a0 to pretend we have two extra slots at the beginning.
+    movi a0, LoadStoreErrorHandlerStack - 8
+    s32i a2, a0, 0x08
+    s32i a3, a0, 0x0c
+    s32i a4, a0, 0x10
+    s32i a5, a0, 0x14
+    rsr.sar a5 # Save SAR in a5 to restore later
+
+    # Examine the opcode which generated the exception
+    # Note: Instructions are in this order to avoid pipeline stalls.
+    rsr.epc1 a2
+    movi a3, ~3
+    ssa8l a2 // sar is now correct shift for aligned read
+    and a2, a2, a3 // a2 now 4-byte aligned address of instruction
+    l32i a4, a2, 0
+    l32i a2, a2, 4
+    movi a3, 0x00700F // opcode mask for l8ui/l16si/l16ui
+    src a2, a2, a4 // a2 now instruction that failed
+    and a3, a2, a3
+    bnei a3, 0x000002, .LSE_check_l16
+
+    # Note: At this point, the opcode could technically be one of two things:
+    #   xx0xx2 (L8UI)
+    #   xx8xx2 (Reserved (invalid) opcode)
+    # It is assumed that we'll never get to this point from an illegal
+    # opcode, so we don't bother to check for that case and presume this is
+    # always an L8UI.
+
+    /* a2 holds instruction */
+    movi a4, ~3
+    rsr.excvaddr a3 // read faulting address
+    and a4, a3, a4 /* a4 now word aligned read address */
+
+    l32i a4, a4, 0 /* perform the actual read */
+    ssa8l a3 /* sar is now shift to extract a3's byte */
+    srl a3, a4 /* shift right correct distance */
+    extui a4, a3, 0, 8 /* mask off bits we need for an l8 */
+
+.LSE_post_fetch:
+    # We jump back here after either the L8UI or the L16*I routines do the
+    # necessary work to read the value from memory.
+    # At this point, a2 holds the faulting instruction and a4 holds the
+    # correctly read value.
+
+    # Restore the original SAR value (saved in a5) and update EPC so we'll
+    # return to the instruction following the one we just emulated.
+    # Note: Instructions are in this order to avoid pipeline stalls.
+    rsr.epc1 a3
+    wsr.sar a5
+    addi a3, a3, 0x3
+    wsr.epc1 a3
+
+    # Stupid opcode tricks: The jumptable we use later on needs 16 bytes
+    # per entry (so we can avoid a second jump by just doing a RFE inside
+    # each entry). Unfortunately, however, Xtensa doesn't have an addx16
+    # operation to make that easy for us. Luckily, all of the faulting
+    # opcodes we're processing are guaranteed to have bit 3 be zero, which
+    # means if we just shift the register bits of the opcode down by 3
+    # instead of 4, we will get the register number multiplied by 2. This
+    # combined with an addx8 will give us an effective addx16 without
+    # needing any extra shift operations.
+    extui a2, a2, 3, 5 /* a2 now destination register 0-15 times 2 */
+
+    bgei a2, 12, .LSE_assign_reg # a6..a15 use jumptable
+    blti a2, 4, .LSE_assign_reg # a0..a1 use jumptable
+
+    # We're storing into a2..a5, which are all saved in our "stack" area.
+    # Calculate the correct address and stick the value in there, then just
+    # do our normal restore and RFE (no jumps required, which actually
+    # makes a2..a5 substantially faster).
+    addx2 a2, a2, a0
+    s32i a4, a2, 0
+
+    # Restore all regs and return
+    l32i a2, a0, 0x08
+    l32i a3, a0, 0x0c
+    l32i a4, a0, 0x10
+    l32i a5, a0, 0x14
+    rsr.excsave1 a0 # restore a0 saved by UserExceptionVector
+    rfe
+
+.LSE_assign_reg:
+    # At this point, a2 contains the register number times 2, and a4 is the
+    # read value.
+    movi a3, .LSE_assign_jumptable
+    addx8 a2, a2, a3 # a2 is now the address to jump to
+
+    # Restore everything except a2 and a4
+    l32i a3, a0, 0x0c
+    l32i a5, a0, 0x14
+
+    jx a2
+
+/* Check the load instruction a2 for an l16si/l16ui instruction
+
+   a2 is the instruction, a3 is the masked instruction */
+    .balign 4
+.LSE_check_l16:
+    movi a4, 0x001002 /* l16si or l16ui opcode after masking */
+    bne a3, a4, .LSE_wrong_opcode
+
+    # Note: At this point, the opcode could be one of two things:
+    #   xx1xx2 (L16UI)
+    #   xx9xx2 (L16SI)
+    # Both of these we can handle.
+
+    movi a4, ~3
+    rsr.excvaddr a3 // read faulting address
+    and a4, a3, a4 /* a4 now word aligned read address */
+
+    l32i a4, a4, 0 /* perform the actual read */
+    ssa8l a3 /* sar is now shift to extract a3's byte */
+    srl a3, a4 /* shift right correct distance */
+    extui a4, a3, 0, 16 /* mask off bits we need for an l16 */
+
+    bbci a2, 15, .LSE_post_fetch # Not a signed op
+    bbci a4, 15, .LSE_post_fetch # Value does not require sign extension
+
+    movi a3, 0xFFFF0000
+    or a4, a3, a4 /* set 32-bit sign bits */
+    j .LSE_post_fetch
+
+/* If we got here it's not an opcode we can try to fix, so bomb out */
+.LSE_wrong_opcode:
+    # Restore registers so any dump the fatal exception routine produces
+    # will have correct values
+    wsr.sar a5 # Restore SAR saved in a5
+    l32i a2, a0, 0x08
+    l32i a3, a0, 0x0c
+    l32i a4, a0, 0x10
+    l32i a5, a0, 0x14
+    call0 sdk_user_fatal_exception_handler
+
+    .balign 4
+.LSE_assign_jumptable:
+    .org .LSE_assign_jumptable + (16 * 0)
+    # a0 is saved in excsave1, so just update that with the value
+    wsr.excsave1 a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 1)
+    mov a1, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    # NOTE: Registers a2..a5 are not handled by the jumptable routines
+    # (they're taken care of directly in .LSE_post_fetch above).
+    # This leaves 64 bytes of wasted space here. We could fill it with
+    # other things, but that would just make it harder to understand what's
+    # going on, and that's bad enough with this routine already. Even on
+    # the ESP8266, 64 bytes of IRAM wasted aren't the end of the world.
+
+    .org .LSE_assign_jumptable + (16 * 6)
+    mov a6, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 7)
+    mov a7, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 8)
+    mov a8, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 9)
+    mov a9, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 10)
+    mov a10, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 11)
+    mov a11, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 12)
+    mov a12, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 13)
+    mov a13, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 14)
+    mov a14, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+    .org .LSE_assign_jumptable + (16 * 15)
+    mov a15, a4
+    l32i a2, a0, 0x08
+    l32i a4, a0, 0x10
+    rsr.excsave1 a0
+    rfe
+
+/* End of LoadStoreErrorHandler */
 
 .section .bss
 NMIHandlerStack: /* stack space for NMI handler */
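The "stupid opcode tricks" arithmetic above is easy to sanity-check in C: with bit 3 of these opcodes known to be zero, extui a2, a2, 3, 5 yields twice the register number, and addx8 then scales that by 8 to index the 16-byte jumptable entries. A minimal, self-contained check (the table address is made up):

    #include <assert.h>
    #include <stdint.h>

    /* extui a2, a2, 3, 5 on an opcode whose bit 3 is zero */
    static uint32_t twice_reg(uint32_t insn) { return (insn >> 3) & 0x1F; }

    /* addx8 a2, a2, a3: a2*8 + a3 -- an effective addx16 on the regnum */
    static uint32_t entry_addr(uint32_t insn, uint32_t table)
    {
        return twice_reg(insn) * 8 + table;
    }

    int main(void)
    {
        for (uint32_t reg = 0; reg < 16; reg++) {
            uint32_t insn = reg << 4; /* t field only; bit 3 clear */
            assert(twice_reg(insn) == 2 * reg);
            assert(entry_addr(insn, 0x4000) == 0x4000 + 16 * reg);
        }
        return 0;
    }
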
@@ -84,6 +336,12 @@
 NMIRegisterSaved: /* register space for saving NMI registers */
     .skip 4*(16 + 6)
 
+LoadStoreErrorHandlerStack:
+    .word 0 # a2
+    .word 0 # a3
+    .word 0 # a4
+    .word 0 # a5
+
 /* Save register relative to a0 */
 .macro SAVE_REG register, regnum
     s32i \register, a0, (0x20 + 4 * \regnum)
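LoadStoreErrorHandlerStack reserves only the four words for a2..a5; the -8 bias the handler applies to a0 makes each register's slot land at a plain regnum * 4, which is also what addx2 computes from the doubled register number. In C terms (the base address is hypothetical):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t slot_addr(uint32_t stack, unsigned regnum) /* 2..5 */
    {
        uint32_t a0 = stack - 8; /* movi a0, LoadStoreErrorHandlerStack - 8 */
        return a0 + regnum * 4;  /* s32i aN, a0, 4*N; addx2 agrees on this */
    }

    int main(void)
    {
        uint32_t stack = 0x3FFF8000; /* made-up .bss address */
        assert(slot_addr(stack, 2) == stack);      /* a2: first word */
        assert(slot_addr(stack, 5) == stack + 12); /* a5: last word */
        return 0;
    }
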
@@ -181,7 +439,7 @@ CallNMIExceptionHandler:
 
     .type UserExceptionHandler, @function
 UserExceptionHandler:
-    mov a0, sp /* a0 was saved in UserExceptionVector */
+    mov a0, sp /* a0 was saved by UserExceptionVector */
     addi sp, sp, -0x50
     s32i a0, sp, 0x10
     rsr.ps a0
@@ -214,7 +472,7 @@ UserHandleTimer:
     and a3, a2, a3 /* a3 = a2 & 0xFFBF, ie remove 0x40 from a2 if set */
     bnez a3, UserTimerDone /* bits other than 0x40 are set */
     movi a3, 0x40
-    sub a12, a2, a3 /* a12 - a2 - 0x40 - I think a12 _must_ be zero here? */
+    sub a12, a2, a3 /* a12 = a2 - 0x40 -- will be zero, since only bit 6 was set */
     call0 sdk__xt_timer_int /* tick timer interrupt */
     mov a2, a12 /* restore a2 from a12, ie zero */
     beqz a2, UserIntDone
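The reworked comment above captures an invariant: once bnez a3 falls through, every bit except 0x40 has been ruled out, so the remnant left in a12 (and later a2) must be zero. Sketched in C (the function name is ours):

    #include <stdint.h>

    /* 'pending' plays the role of a2 on entry to UserHandleTimer */
    uint32_t timer_remnant(uint32_t pending)
    {
        if (pending & ~0x40u)  /* bnez a3, UserTimerDone */
            return pending;    /* other bits still need handling */
        /* sdk__xt_timer_int() runs here; only bit 6 (0x40) was set, so */
        return pending - 0x40; /* the remnant is always zero */
    }
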
@@ -226,7 +484,7 @@ UserIntDone:
     break 1, 1 /* non-zero remnant in a2 means fail */
     call0 sdk_user_fatal_exception_handler
 UserIntExit:
-    call0 sdk__xt_int_exit /* calls rfi */
+    call0 sdk__xt_int_exit /* jumps to _xt_user_exit. Never returns here */
 
 /* _xt_user_exit is used to exit interrupt context. TODO: Find a better place for this to live.
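Finally, the two bbci tests plus the 0xFFFF0000 OR in .LSE_check_l16 amount to ordinary 16-bit sign extension, which this small sketch mirrors (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t extend_l16(uint32_t insn, uint32_t half)
    {
        if ((insn & 0x8000) == 0) return half; /* bbci a2, 15: not l16si */
        if ((half & 0x8000) == 0) return half; /* bbci a4, 15: non-negative */
        return half | 0xFFFF0000;              /* set 32-bit sign bits */
    }

    int main(void)
    {
        assert(extend_l16(0x009002, 0x8001) == 0xFFFF8001); /* l16si, neg */
        assert(extend_l16(0x001002, 0x8001) == 0x00008001); /* l16ui */
        return 0;
    }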