From bcacfa426a2812e9b35bf7d20925be15bd85fd81 Mon Sep 17 00:00:00 2001
From: Alex Stewart <Alexander.Stewart@consensuscorp.com>
Date: Mon, 21 Sep 2015 22:13:14 -0700
Subject: [PATCH] Further optimizations for LoadStoreErrorHandler

---
 core/exception_unaligned_load.S.inc | 190 -------------------
 core/exception_vectors.S            | 278 +++++++++++++++++++++++++++-
 2 files changed, 268 insertions(+), 200 deletions(-)
 delete mode 100644 core/exception_unaligned_load.S.inc

diff --git a/core/exception_unaligned_load.S.inc b/core/exception_unaligned_load.S.inc
deleted file mode 100644
index 03c9830..0000000
--- a/core/exception_unaligned_load.S.inc
+++ /dev/null
@@ -1,190 +0,0 @@
-/* Xtensa Exception unaligned load handler
-
-   Completes l8/l16 load instructions from Instruction address space,
-   that the architecture require to be 4 byte aligned word reads.
-
-   Called from either UserExceptionVector or DoubleExceptionVector
-   depending on where the exception happened.
-
-   Fast path (no branches) is for l8ui.
-
-   Part of esp-open-rtos
-   Copyright (C) Angus Gratton
-   BSD Licensed as described in the file LICENSE
-*/
-       .text
-       .section .vecbase.text, "x"
-       .literal_position
-
-/* "Fix" LoadStoreException exceptions that are l8/l16 from an Instruction region,
-   normal exception variant. */
-UserExceptionLoadStoreHandler:
-	addi sp, sp, -0x18
-	s32i a2, sp, 0x08
-	rsr.epc1 a2
-/* Inner UserLoadStoreExceptionHandler handlers. Works for both level1 & level 2 interrupt level.
- *
- * Called from level-specific handler above which sets up stack and loads epcX into a2.
- */
-InnerLoadStoreExceptionHandler:
-	s32i a3, sp, 0x0c
-	s32i a4, sp, 0x10
-	s32i a5, sp, 0x14
-	rsr.sar a0 // save sar in a0
-
-	/* Examine the instruction we failed to execute (in a2) */
-	ssa8l a2 // sar is now correct shift for aligned read
-	movi	a3, ~3
-	and a2, a2, a3 // a2 now 4-byte aligned address of instruction
-	l32i a3, a2, 0
-	l32i a4, a2, 4
-	src a2, a4, a3 // a2 now instruction that failed
-
-	/* check for l8ui opcode 0x000002, or branch to check l16 */
-	movi a3, 0x00700F /* opcode mask for l8ui/l16si/l16ui */
-	and a3, a2, a3
-	bnei a3, 0x000002, .Lcheck_fix_16bit
-	movi a5, 0xFF
-
-.Lcan_fix:
-	/* verified an 8- or 16-bit read
-	a2 holds instruction, a5 holds mask to apply to read value
-	*/
-	rsr.excvaddr a3 // read faulting address
-	ssa8l a3 /* sar is now shift to extract a3's byte */
-	movi	a4, ~3
-	and	a3, a3, a4 /* a3 now word aligned read address */
-
-	l32i a3, a3, 0  /* perform the actual read */
-	srl a3, a3	/* shift right correct distance */
-	and a4, a3, a5  /* mask off bits we need for an l8/l16 */
-
-	bbsi a5, 14, .Lmaybe_extend_sign
-.Lafter_extend_sign:
-	/* a2 holds instruction, a4 holds the correctly read value */
-	extui a2, a2, 4, 4 /* a2 now destination register 0-15 */
-
-	/* test if a4 needs to be written directly to a register (ie not a working register) */
-	bgei a2, 6, .Lwrite_value_direct_reg
-	/* test if a4 needs to be written to a0 */
-	beqz a2, .Lwrite_value_a0_reg
-
-	/* otherwise, a4 can be written to a saved working register 'slot' on the stack */
-	addx4 a5, a2, sp
-	s32i a4, a5, 0
-
-.Lafter_write_value:
-	/* test PS.INTLEVEL (1=User, 2=Double) to see which interrupt level we restore from
-	*/
-	rsr.ps a2
-	bbsi a2, 1, .Lincrement_PC_intlevel2
-.Lincrement_PC_intlevel1:
-	rsr.epc1 a2
-	addi a3, a2, 0x3
-	wsr.epc1 a3
-	wsr.sar a0 // restore saved sar
-	rsr.excsave1 a0 // restore a0 saved in exception vector
-.Lafter_increment_PC:
-	// Restore registers
-	l32i a2, sp, 0x08
-	l32i a3, sp, 0x0c
-	l32i a4, sp, 0x10
-	l32i a5, sp, 0x14
-	addi sp, sp, 0x18
-	rfe
-
-
-/* Check the load instruction a2 for an l16si/16ui instruction
-
-   First test for a signed vs unsigned load.
-
-   a2 is the instruction, need to load a5 with the mask to use */
-.Lcheck_fix_16bit:
-	movi a4, 0x001002 /* l16si or l16ui opcode after masking */
-	bne a3, a4, .Lcant_fix
-
-	bbsi a2, 15, .Lcan_fix_16bit_signed
-	movi a5, 0xFFFF
-	j .Lcan_fix
-.Lcan_fix_16bit_signed:
-	movi a5, 0x7FFF
-	j .Lcan_fix
-
-/* not an opcode we can try to fix, so bomb out
-   TODO: the exception dump will have some wrong values in it */
-.Lcant_fix:
-	call0 sdk_user_fatal_exception_handler
-
-/* increment PC for a DoubleException */
-.Lincrement_PC_intlevel2:
-	rsr.epc2 a2
-	addi a3, a2, 0x3
-	wsr.epc2 a3
-	wsr.sar a0 // restore saved sar
-	rsr.excsave2 a0 // restore a0 saved in exception vector
-	j .Lafter_increment_PC
-
-.Lmaybe_extend_sign: /* apply 16-bit sign extension if necessary
-	                a3 holds raw value, a4 holds masked */
-	bbsi a5, 15, .Lafter_extend_sign /* 16-bit unsigned, no sign extension */
-	bbci a3, 15, .Lafter_extend_sign /* sign bit not set, no sign extension */
-	movi a3, 0xFFFF8000
-	or a4, a3, a4 /* set 32-bit sign bits */
-	j .Lafter_extend_sign
-
-.Lwrite_value_direct_reg:
-	/* Directly update register index a2, in range 6-15, using value in a4 */
-	addi a2, a2, -6
-	slli a2, a2, 3 /* offset from a6, x8 */
-	movi a3, .Ldirect_reg_jumptable
-	add a2, a2, a3
-	jx a2
-	.align 8
-.Ldirect_reg_jumptable:
-	mov a6, a4
-	j .Lafter_write_value
-	.align 8
-	mov a7, a4
-	j .Lafter_write_value
-	.align 8
-	mov a8, a4
-	j .Lafter_write_value
-	.align 8
-	mov a9, a4
-	j .Lafter_write_value
-	.align 8
-	mov a10, a4
-	j .Lafter_write_value
-	.align 8
-	mov a11, a4
-	j .Lafter_write_value
-	.align 8
-	mov a12, a4
-	j .Lafter_write_value
-	.align 8
-	mov a13, a4
-	j .Lafter_write_value
-	.align 8
-	mov a14, a4
-	j .Lafter_write_value
-	.align 8
-	mov a15, a4
-	j .Lafter_write_value
-
-.Lwrite_value_a0_reg:
-	/* a0 is saved in excsave1,so just update this with value
-           TODO: This won't work with interrupt level 2
-	*/
-	wsr.excsave1 a4
-	j .Lafter_write_value
-
-	.literal_position
-/* "Fix" LoadStoreException exceptions that are l8/l16 from an Instruction region,
-   DoubleException exception variant (ie load happened in a level1 exception handler). */
-DoubleExceptionLoadStoreHandler:
-	addi sp, sp, -0x18
-	s32i a2, sp, 0x08
-	rsr.epc2 a2
-	j InnerLoadStoreExceptionHandler
-
-/* End of InnerUserLoadStoreExceptionHandler */
diff --git a/core/exception_vectors.S b/core/exception_vectors.S
index 270b711..a577034 100644
--- a/core/exception_vectors.S
+++ b/core/exception_vectors.S
@@ -26,7 +26,6 @@
 	.text
 	.section .vecbase.text, "x"
         .global VecBase
-        .type   VecBase, @function /* it's not really a function, but treat it like one */
 	.org 0
 VecBase:
 	/* IMPORTANT: exception vector literals will go here, but we
@@ -36,35 +35,38 @@ VecBase:
 	*/
 	.literal_position
 	.org 0x10
+	.type   DebugExceptionVector, @function
 DebugExceptionVector:
 	wsr.excsave2 a0
 	call0 sdk_user_fatal_exception_handler
 	rfi 2
 
 	.org 0x20
+	.type   NMIExceptionVector, @function
 NMIExceptionVector:
 	wsr.excsave3 a0
 	call0 CallNMIExceptionHandler
 	rfi 3 /* CallNMIExceptionHandler should call rfi itself */
 
 	.org 0x30
+	.type   KernelExceptionVector, @function
 KernelExceptionVector:
 	break 1, 0
 	call0 sdk_user_fatal_exception_handler
 	rfe
 
 	.org 0x50
+	.type   UserExceptionVector, @function
 UserExceptionVector:
 	wsr.excsave1 a0
 	rsr.exccause a0
-	beqi a0, CAUSE_LOADSTORE, UserExceptionLoadStoreHandler
+	beqi a0, CAUSE_LOADSTORE, LoadStoreErrorHandler /* emulate the faulting l8ui/l16ui/l16si; original a0 is in EXCSAVE1 */
 	j UserExceptionHandler
 
 	.org 0x70
+	.type   DoubleExceptionVector, @function
 DoubleExceptionVector:
 	break 1, 4
-	rsr.exccause a0
-	beqi a0, CAUSE_LOADSTORE, DoubleExceptionLoadStoreHandler
 	call0 sdk_user_fatal_exception_handler
 
 /* Reset vector would go here at offset 0x80 but should be unused,
@@ -72,10 +74,260 @@ DoubleExceptionVector:
 
 /***** end of exception vectors  *****/
 
-/* We include this here so UserExceptionLoadStoreHandler is within
-   the range of a 'beq' instruction jump.
+/* Xtensa Exception unaligned load handler
+
+   Completes l8/l16 load instructions from Instruction address space,
+   for which the architecture only supports 32-bit reads.
+
+   Called from UserExceptionVector if EXCCAUSE is LoadStoreErrorCause
+
+   Fast path (no branches) is for l8ui.
 */
-#include "exception_unaligned_load.S.inc"
+	.literal_position
+
+	.type   LoadStoreErrorHandler, @function
+LoadStoreErrorHandler:
+	# Note: we use a0 as our "stack pointer" here because it's already been
+	# saved in UserExceptionVector, and we never call out to other routines
+	# so we don't have to worry about it being clobbered.  It would be
+	# preferable to use a1 instead, but this would require changes to other
+	# parts of UserExceptionHandler code which we haven't gotten around to
+	# yet.
+	# TODO: Eventually, switch everything over to saving a1 instead of a0
+	# in UserExceptionVector so we can use the more mnemonic SP for this.
+
+	# Note: registers are saved in the (regnum * 4) address so calculation
+	# is easier later on.  This means we don't use the first two entries
+	# (since we don't save a0 or a1 here), so we just adjust the pointer in
+	# a0 to pretend we have two extra slots at the beginning.
+	movi	a0, LoadStoreErrorHandlerStack - 8
+	s32i	a2, a0, 0x08
+	s32i	a3, a0, 0x0c
+	s32i	a4, a0, 0x10
+	s32i	a5, a0, 0x14
+	rsr.sar a5              # Save SAR in a5 to restore later
+
+	# Examine the opcode which generated the exception
+	# Note: Instructions are in this order to avoid pipeline stalls.
+	rsr.epc1 a2
+	movi	a3, ~3
+	ssa8l	a2 // sar is now correct shift for aligned read
+	and	a2, a2, a3 // a2 now 4-byte aligned address of instruction
+	l32i	a4, a2, 0 /* aligned word holding the start of the instruction */
+	l32i	a2, a2, 4 /* following word, in case the instruction straddles both */
+	movi	a3, 0x00700F // opcode mask for l8ui/l16si/l16ui
+	src	a2, a2, a4   // a2 now instruction that failed
+	and	a3, a2, a3
+	bnei	a3, 0x000002, .LSE_check_l16 /* not l8ui: go check for l16ui/l16si */
+
+	# Note: At this point, opcode could technically be one of two things:
+	#   xx0xx2 (L8UI)
+	#   xx8xx2 (Reserved (invalid) opcode)
+	# It is assumed that we'll never get to this point from an illegal
+	# opcode, so we don't bother to check for that case and presume this is
+	# always an L8UI.
+
+	/* a2 holds instruction */
+	movi	a4, ~3
+	rsr.excvaddr a3 // read faulting address
+	and	a4, a3, a4 /* a4 now word aligned read address */
+
+	l32i	a4, a4, 0  /* perform the actual read */
+	ssa8l	a3 /* sar is now shift to extract a3's byte */
+	srl	a3, a4	/* shift right correct distance */
+	extui	a4, a3, 0, 8 /* mask off bits we need for an l8 */
+
+.LSE_post_fetch:
+	# We jump back here after either the L8UI or the L16*I routines do the
+	# necessary work to read the value from memory.
+	# At this point, a2 holds the faulting instruction and a4 holds the
+	# correctly read value.
+
+	# Restore original SAR value (saved in a5) and update EPC so we'll
+	# return back to the instruction following the one we just emulated
+	# Note: Instructions are in this order to avoid pipeline stalls
+	rsr.epc1 a3
+	wsr.sar	a5
+	addi	a3, a3, 0x3 /* step EPC1 past the emulated instruction */
+	wsr.epc1 a3
+
+	# Stupid opcode tricks: The jumptable we use later on needs 16 bytes
+	# per entry (so we can avoid a second jump by just doing a RFE inside
+	# each entry).  Unfortunately, however, Xtensa doesn't have an addx16
+	# operation to make that easy for us.  Luckily, all of the faulting
+	# opcodes we're processing are guaranteed to have bit 3 be zero, which
+	# means if we just shift the register bits of the opcode down by 3
+	# instead of 4, we will get the register number multiplied by 2.  This
+	# combined with an addx8 will give us an effective addx16 without
+	# needing any extra shift operations.
+	extui a2, a2, 3, 5 /* a2 now destination register 0-15 times 2 */
+
+	bgei    a2, 12, .LSE_assign_reg  # a6..a15 use jumptable
+	blti    a2, 4, .LSE_assign_reg   # a0..a1 use jumptable
+
+	# We're storing into a2..a5, which are all saved in our "stack" area.
+	# Calculate the correct address and stick the value in there, then just
+	# do our normal restore and RFE (no jumps required, which actually
+	# makes a2..a5 substantially faster).
+	addx2	a2, a2, a0 /* a2 = a0 + regnum * 4: that register's save slot */
+	s32i	a4, a2, 0 /* the restore below then loads it into the register */
+
+	# Restore all regs and return
+	l32i	a2, a0, 0x08
+	l32i	a3, a0, 0x0c
+	l32i	a4, a0, 0x10
+	l32i	a5, a0, 0x14
+	rsr.excsave1 a0       # restore a0 saved by UserExceptionVector
+	rfe
+
+.LSE_assign_reg:
+	# At this point, a2 contains the register number times 2, a4 is the
+	# read value.
+
+	movi a3, .LSE_assign_jumptable
+	addx8 a2, a2, a3  # a2 is now the address to jump to
+
+	# Restore everything except a2 and a4
+	l32i	a3, a0, 0x0c
+	l32i	a5, a0, 0x14
+
+	jx a2 /* table entry writes the register, restores a2/a4/a0, and does the rfe */
+
+/* Check the load instruction a2 for an l16si/l16ui instruction
+
+   a2 is the instruction, a3 is masked instruction */
+	.balign 4
+.LSE_check_l16:
+	movi a4, 0x001002 /* l16si or l16ui opcode after masking */
+	bne a3, a4, .LSE_wrong_opcode /* neither l8ui nor l16ui/l16si: cannot emulate */
+
+	# Note: At this point, the opcode could be one of two things:
+	#   xx1xx2 (L16UI)
+	#   xx9xx2 (L16SI)
+	# Both of these we can handle.
+
+	movi	a4, ~3
+	rsr.excvaddr a3 // read faulting address
+	and	a4, a3, a4 /* a4 now word aligned read address */
+
+	l32i	a4, a4, 0  /* perform the actual read */
+	ssa8l	a3 /* sar is now shift to extract a3's byte */
+	srl	a3, a4	/* shift right correct distance */
+	extui	a4, a3, 0, 16 /* mask off bits we need for an l16 */
+
+	bbci	a2, 15, .LSE_post_fetch # Not a signed op
+	bbci	a4, 15, .LSE_post_fetch # Value does not require sign-extension
+
+	movi a3, 0xFFFF0000 /* upper 16 bits all ones */
+	or a4, a3, a4 /* set 32-bit sign bits */
+	j .LSE_post_fetch
+
+/* If we got here it's not an opcode we can try to fix, so bomb out */
+.LSE_wrong_opcode:
+	# Restore SAR and a2..a5 so any dump the fatal exception routine
+	# produces will have correct values for those registers
+	wsr.sar a5            # Restore SAR saved in a5
+	l32i	a2, a0, 0x08
+	l32i	a3, a0, 0x0c
+	l32i	a4, a0, 0x10
+	l32i	a5, a0, 0x14
+	call0	sdk_user_fatal_exception_handler /* a0 is not restored: call0 overwrites it; the original a0 is in EXCSAVE1 */
+
+	.balign 4
+.LSE_assign_jumptable: /* one 16-byte entry per destination register, reached via jx from .LSE_assign_reg */
+	.org    .LSE_assign_jumptable + (16 * 0)
+	# a0 is saved in excsave1, so just update that with the value
+	wsr.excsave1 a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 1)
+	mov	a1, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	# NOTE: Registers a2 .. a5 are not handled by the jumptable routines
+	# (they're taken care of directly in .LSE_post_fetch above)
+	# This leaves 64 bytes (4 entries of 16) of wasted space here.  We could fill it with
+	# other things, but that would just make it harder to understand what's
+	# going on, and that's bad enough with this routine already.  Even on
+	# the ESP8266, 64 bytes of IRAM wasted aren't the end of the world..
+
+	.org    .LSE_assign_jumptable + (16 * 6)
+	mov	a6, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 7)
+	mov	a7, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 8)
+	mov	a8, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 9)
+	mov	a9, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 10)
+	mov	a10, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 11)
+	mov	a11, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 12)
+	mov	a12, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 13)
+	mov	a13, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 14)
+	mov	a14, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+	.org    .LSE_assign_jumptable + (16 * 15)
+	mov	a15, a4
+	l32i	a2, a0, 0x08
+	l32i	a4, a0, 0x10
+	rsr.excsave1 a0
+	rfe
+
+/* End of LoadStoreErrorHandler */
 
 	.section .bss
 NMIHandlerStack: /* stack space for NMI handler */
@@ -84,6 +336,12 @@ NMIHandlerStack: /* stack space for NMI handler */
 NMIRegisterSaved: /* register space for saving NMI registers */
 	.skip 4*(16 + 6)
 
+LoadStoreErrorHandlerStack: /* a2..a5 save area; LoadStoreErrorHandler addresses it via (this label - 8) */
+	.word	0	# a2 (offset 0x08 from the handler's biased pointer)
+	.word	0	# a3 (offset 0x0c)
+	.word	0	# a4 (offset 0x10)
+	.word	0	# a5 (offset 0x14)
+
 /* Save register relative to a0 */
 .macro SAVE_REG register, regnum
 	s32i \register, a0, (0x20 + 4 * \regnum)
@@ -181,7 +439,7 @@ CallNMIExceptionHandler:
 
         .type   UserExceptionHandler, @function
 UserExceptionHandler:
-	mov a0, sp /* a0 was saved in UserExceptionVector */
+	mov a0, sp /* a0 was saved by UserExceptionVector */
 	addi sp, sp, -0x50
 	s32i a0, sp, 0x10
 	rsr.ps a0
@@ -214,7 +472,7 @@ UserHandleTimer:
 	and a3, a2, a3 /* a3 = a2 & 0xFFBF, ie remove 0x40 from a2 if set */
 	bnez a3, UserTimerDone /* bits other than 0x40 are set */
 	movi a3, 0x40
-	sub a12, a2, a3 /* a12 - a2 - 0x40 - I think a12 _must_ be zero here? */
+	sub a12, a2, a3 /* a12 = a2 - 0x40 -- Will be zero if bit 6 set */
 	call0 sdk__xt_timer_int /* tick timer interrupt */
 	mov a2, a12 /* restore a2 from a12, ie zero */
 	beqz a2, UserIntDone
@@ -226,7 +484,7 @@ UserIntDone:
 	break 1, 1 /* non-zero remnant in a2 means fail */
 	call0 sdk_user_fatal_exception_handler
 UserIntExit:
-	call0 sdk__xt_int_exit /* calls rfi */
+	call0 sdk__xt_int_exit /* jumps to _xt_user_exit. Never returns here */
 
 /* _xt_user_exit is used to exit interrupt context.
     TODO: Find a better place for this to live.