I-RAM memory store support for 8- and 16-bit stores

This commit is contained in:
Our Air Quality 2017-12-19 14:27:49 +11:00
parent a8c60e0960
commit 8ea1774e1d

View file

@ -44,6 +44,8 @@ LoadStoreErrorHandlerStack:
.word 0 # a2
.word 0 # a3
.word 0 # a4
.word 0 # a5
.word 0 # a6
.balign 4
.global debug_saved_ctx
@ -123,7 +125,11 @@ DoubleExceptionVector:
*
* Called from UserExceptionVector if EXCCAUSE is LoadStoreErrorCause
*
* (Fast path (no branches) is for L8UI)
 * Accesses can be assumed aligned here, as unaligned accesses would have generated
 * an unaligned exception (9) before getting here.
*
 * Fast path (no branches) is for L8UI from low registers 0, 2-5, and the fast
 * store path is S8I for low registers 0, 2-6 (a7..a15 go via the jumptable).
*/
.literal_position
@ -141,6 +147,8 @@ LoadStoreErrorHandler:
s32i a4, sp, 0x10
rsr a0, sar # Save SAR in a0 to restore later
# led_on a2, a3
/* Examine the opcode which generated the exception */
/* Note: Instructions are in this order to avoid pipeline stalls. */
rsr a2, epc1
@ -185,6 +193,8 @@ LoadStoreErrorHandler:
addi a3, a3, 0x3
wsr a3, epc1
# led_off a0, a3
/* Stupid opcode tricks: The jumptable we use later on needs 16 bytes
* per entry (so we can avoid a second jump by just doing a RFE inside
* each entry). Unfortunately, however, Xtensa doesn't have an addx16
@ -214,25 +224,11 @@ LoadStoreErrorHandler:
rsr a1, excsave1 # restore a1 saved by UserExceptionVector
rfe
.LSE_assign_reg:
/* At this point, a2 contains the register number times 2, a4 is the
* read value. */
/* Calculate the jumptable address, and restore all regs except a2 and
* a4 so we have less to do after jumping. */
/* Note: Instructions are in this order to avoid pipeline stalls. */
movi a3, .LSE_jumptable_base
l32i a0, sp, 0
addx8 a2, a2, a3 # a2 is now the address to jump to
l32i a3, sp, 0x0c
jx a2
.balign 4
.LSE_check_l16:
/* At this point, a2 contains the opcode, a3 is masked opcode */
movi a4, 0x001002 # l16si or l16ui opcode after masking
bne a3, a4, .LSE_wrong_opcode
bne a3, a4, .LSE_check_s8i
/* Note: At this point, the opcode could be one of two things:
* xx1xx2 (L16UI)
@ -255,32 +251,91 @@ LoadStoreErrorHandler:
or a4, a3, a4 # set 32-bit sign bits
j .LSE_post_fetch
.LSE_wrong_opcode:
/* If we got here it's not an opcode we can try to fix, so bomb out.
* Restore registers so any dump the fatal exception routine produces
* will have correct values */
wsr a0, sar
l32i a0, sp, 0
/*l32i a2, sp, 0x08*/
l32i a3, sp, 0x0c
l32i a4, sp, 0x10
rsr a1, excsave1
mov a2, a1
movi a3, 0
call0 fatal_exception_handler
.balign 4
.LSE_assign_a1:
/* a1 is saved in excsave1, so just update that with the value, */
wsr a4, excsave1
/* Then restore all regs and return */
.LSE_check_s8i:
/* At this point, a2 contains the opcode */
movi a3, 0x00F00F # opcode mask for s8i/s16i
s32i a5, sp, 0x14 # Save a5, needed for store op
and a3, a2, a3 # a3 is masked instruction
movi a4, 0x004002 # s8i opcode after masking
s32i a6, sp, 0x18 # Save a6, needed for store op
bne a3, a4, .LSE_check_s16i
/* Note: At this point, the opcode is s8i */
movi a5, 0x000000ff # source mask
.LSE_store:
/* We jump here for either S8I or S16I to get the address and load
* and mask the current contents. */
movi a4, ~3
rsr a3, excvaddr # read faulting address
and a4, a3, a4 # a4 now word aligned address
ssa8b a3 # sar is now left shift amount
sll a3, a5
movi a6, 0xffffffff
xor a6, a6, a3 # a6 now has the word mask
l32i a3, a4, 0 # read the current word
and a3, a3, a6 # a3 now has the masked word
extui a2, a2, 4, 4 # a2 is now source register 0-15
/* At this point, a2 contains the source register 0-15, a3 contains the
* masked memory contents, a4 contains the address, a5 contains the source
* mask, and sar contains the left shift amount. */
bgei a2, 7, .LSE_load_reg # a7..a15 use jumptable
beqi a2, 1, .LSE_load_a1 # a1 uses a special routine
/* We're loading from a0 or a2..a6, which are all saved in our "stack"
* area. Calculate the correct address and load the value there. */
addx4 a2, a2, sp
l32i a2, a2, 0
.LSE_store_apply:
and a2, a2, a5 # mask the source
sll a2, a2 # shift the source
or a3, a3, a2 # combine with the masked memory contents
s32i a3, a4, 0 # write back to memory
/* Note: Instructions are in this order to avoid pipeline stalls */
rsr a3, epc1
wsr a0, sar
addi a3, a3, 0x3
wsr a3, epc1
# led_off a2, a3
/* Restore all regs and return */
l32i a0, sp, 0
l32i a2, sp, 0x08
l32i a3, sp, 0x0c
l32i a4, sp, 0x10
rsr a1, excsave1
l32i a5, sp, 0x14
l32i a6, sp, 0x18
rsr a1, excsave1 # restore a1 saved by UserExceptionVector
rfe
.balign 4
.LSE_check_s16i:
/* At this point, a2 contains the opcode */
movi a4, 0x005002 # s16i opcode after masking
bne a3, a4, .LSE_wrong_opcode
/* Note: At this point, the opcode is s16i */
movi a5, 0x0000ffff # source mask
j .LSE_store
.balign 4
.LSE_assign_reg:
/* At this point, a2 contains the register number times 2, a4 is the
* read value. */
/* Calculate the jumptable address, and restore all regs except a2 and
* a4 so we have less to do after jumping. */
/* Note: Instructions are in this order to avoid pipeline stalls. */
movi a3, .LSE_jumptable_base
l32i a0, sp, 0
addx8 a2, a2, a3 # a2 is now the address to jump to
l32i a3, sp, 0x0c
jx a2
.balign 4
.LSE_jumptable:
/* The first 5 entries (80 bytes) of this table are unused (registers
@ -366,6 +421,81 @@ LoadStoreErrorHandler:
rsr a1, excsave1
rfe
.balign 4
.LSE_assign_a1:
/* a1 is saved in excsave1, so just update that with the value, */
wsr a4, excsave1
/* Then restore all regs and return */
l32i a0, sp, 0
l32i a2, sp, 0x08
l32i a3, sp, 0x0c
l32i a4, sp, 0x10
rsr a1, excsave1
rfe
.balign 4
.LSE_load_reg:
/* Calculate the jumptable address. */
movi a6, .LSE_store_jumptable_base
addx8 a2, a2, a6 # a2 is now the address to jump to
jx a2
.balign 4
.LSE_store_jumptable:
/* The first 7 entries (56 bytes) of this table are unused (registers
* a0..a6 are handled separately above). Rather than have a whole bunch
* of wasted space, we just pretend that the table starts 56 bytes
* earlier in memory. */
.set .LSE_store_jumptable_base, .LSE_store_jumptable - (8 * 7)
mov a2, a7
j .LSE_store_apply
.balign 4
mov a2, a8
j .LSE_store_apply
.balign 4
mov a2, a9
j .LSE_store_apply
.balign 4
mov a2, a10
j .LSE_store_apply
.balign 4
mov a2, a11
j .LSE_store_apply
.balign 4
mov a2, a12
j .LSE_store_apply
.balign 4
mov a2, a13
j .LSE_store_apply
.balign 4
mov a2, a14
j .LSE_store_apply
.balign 4
mov a2, a15
j .LSE_store_apply
.balign 4
.LSE_load_a1:
/* a1 is saved in excsave1, so just read the value, */
rsr a2, excsave1
j .LSE_store_apply
.balign 4
.LSE_wrong_opcode:
/* If we got here it's not an opcode we can try to fix, so bomb out.
* Restore registers so any dump the fatal exception routine produces
* will have correct values */
wsr a0, sar
l32i a0, sp, 0
/*l32i a2, sp, 0x08*/
l32i a3, sp, 0x0c
l32i a4, sp, 0x10
rsr a1, excsave1
mov a2, a1
movi a3, 0
call0 fatal_exception_handler
/*************************** Debug exception handler *************************/
.section .vecbase.text, "x"