Diffstat (limited to 'libgcc/config/sh')
-rw-r--r--  libgcc/config/sh/lib1funcs.S | 3933
-rw-r--r--  libgcc/config/sh/lib1funcs.h |   76
-rw-r--r--  libgcc/config/sh/t-linux     |    2
-rw-r--r--  libgcc/config/sh/t-netbsd    |    1
-rw-r--r--  libgcc/config/sh/t-sh        |   15
-rw-r--r--  libgcc/config/sh/t-sh64      |    6
6 files changed, 4029 insertions, 4 deletions
diff --git a/libgcc/config/sh/lib1funcs.S b/libgcc/config/sh/lib1funcs.S new file mode 100644 index 00000000000..2f0ca16cd91 --- /dev/null +++ b/libgcc/config/sh/lib1funcs.S @@ -0,0 +1,3933 @@ +/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2009 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +#include "lib1funcs.h" + +/* t-vxworks needs to build both PIC and non-PIC versions of libgcc, + so it is more convenient to define NO_FPSCR_VALUES here than to + define it on the command line. */ +#if defined __vxworks && defined __PIC__ +#define NO_FPSCR_VALUES +#endif + +#if ! 
__SH5__ +#ifdef L_ashiftrt + .global GLOBAL(ashiftrt_r4_0) + .global GLOBAL(ashiftrt_r4_1) + .global GLOBAL(ashiftrt_r4_2) + .global GLOBAL(ashiftrt_r4_3) + .global GLOBAL(ashiftrt_r4_4) + .global GLOBAL(ashiftrt_r4_5) + .global GLOBAL(ashiftrt_r4_6) + .global GLOBAL(ashiftrt_r4_7) + .global GLOBAL(ashiftrt_r4_8) + .global GLOBAL(ashiftrt_r4_9) + .global GLOBAL(ashiftrt_r4_10) + .global GLOBAL(ashiftrt_r4_11) + .global GLOBAL(ashiftrt_r4_12) + .global GLOBAL(ashiftrt_r4_13) + .global GLOBAL(ashiftrt_r4_14) + .global GLOBAL(ashiftrt_r4_15) + .global GLOBAL(ashiftrt_r4_16) + .global GLOBAL(ashiftrt_r4_17) + .global GLOBAL(ashiftrt_r4_18) + .global GLOBAL(ashiftrt_r4_19) + .global GLOBAL(ashiftrt_r4_20) + .global GLOBAL(ashiftrt_r4_21) + .global GLOBAL(ashiftrt_r4_22) + .global GLOBAL(ashiftrt_r4_23) + .global GLOBAL(ashiftrt_r4_24) + .global GLOBAL(ashiftrt_r4_25) + .global GLOBAL(ashiftrt_r4_26) + .global GLOBAL(ashiftrt_r4_27) + .global GLOBAL(ashiftrt_r4_28) + .global GLOBAL(ashiftrt_r4_29) + .global GLOBAL(ashiftrt_r4_30) + .global GLOBAL(ashiftrt_r4_31) + .global GLOBAL(ashiftrt_r4_32) + + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32)) + + .align 1 +GLOBAL(ashiftrt_r4_32): +GLOBAL(ashiftrt_r4_31): + rotcl r4 + rts + subc r4,r4 + +GLOBAL(ashiftrt_r4_30): + shar r4 +GLOBAL(ashiftrt_r4_29): + shar r4 +GLOBAL(ashiftrt_r4_28): + shar r4 +GLOBAL(ashiftrt_r4_27): + shar r4 +GLOBAL(ashiftrt_r4_26): + shar r4 +GLOBAL(ashiftrt_r4_25): + shar r4 +GLOBAL(ashiftrt_r4_24): + shlr16 r4 + shlr8 r4 + rts + exts.b r4,r4 + +GLOBAL(ashiftrt_r4_23): + shar r4 +GLOBAL(ashiftrt_r4_22): + shar r4 +GLOBAL(ashiftrt_r4_21): + shar r4 +GLOBAL(ashiftrt_r4_20): + shar r4 +GLOBAL(ashiftrt_r4_19): + shar r4 +GLOBAL(ashiftrt_r4_18): + shar r4 +GLOBAL(ashiftrt_r4_17): + shar r4 +GLOBAL(ashiftrt_r4_16): + shlr16 r4 + rts + exts.w r4,r4 + +GLOBAL(ashiftrt_r4_15): + shar r4 +GLOBAL(ashiftrt_r4_14): + shar r4 +GLOBAL(ashiftrt_r4_13): + shar r4 +GLOBAL(ashiftrt_r4_12): + shar r4 +GLOBAL(ashiftrt_r4_11): + shar r4 +GLOBAL(ashiftrt_r4_10): + shar r4 +GLOBAL(ashiftrt_r4_9): + shar r4 +GLOBAL(ashiftrt_r4_8): + shar r4 +GLOBAL(ashiftrt_r4_7): + shar r4 +GLOBAL(ashiftrt_r4_6): + shar r4 +GLOBAL(ashiftrt_r4_5): + shar r4 +GLOBAL(ashiftrt_r4_4): + shar r4 +GLOBAL(ashiftrt_r4_3): + shar r4 +GLOBAL(ashiftrt_r4_2): + shar r4 +GLOBAL(ashiftrt_r4_1): + rts + shar r4 
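!
! Note on the entry points above: counts 1..15 simply fall through
! one `shar r4' per bit; counts 17..23 and 25..30 shar down to the
! shlr16 / shlr8 + exts.w / exts.b shortcuts at 16 and 24; and
! counts 31 and 32 avoid shifting altogether: `rotcl r4' copies the
! sign bit into T, then `subc r4,r4' computes -T, filling r4 with
! the sign (e.g. 0x80000000 -> 0xffffffff, 0x40000000 -> 0).
!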
+ +GLOBAL(ashiftrt_r4_0): + rts + nop + + ENDFUNC(GLOBAL(ashiftrt_r4_0)) + ENDFUNC(GLOBAL(ashiftrt_r4_1)) + ENDFUNC(GLOBAL(ashiftrt_r4_2)) + ENDFUNC(GLOBAL(ashiftrt_r4_3)) + ENDFUNC(GLOBAL(ashiftrt_r4_4)) + ENDFUNC(GLOBAL(ashiftrt_r4_5)) + ENDFUNC(GLOBAL(ashiftrt_r4_6)) + ENDFUNC(GLOBAL(ashiftrt_r4_7)) + ENDFUNC(GLOBAL(ashiftrt_r4_8)) + ENDFUNC(GLOBAL(ashiftrt_r4_9)) + ENDFUNC(GLOBAL(ashiftrt_r4_10)) + ENDFUNC(GLOBAL(ashiftrt_r4_11)) + ENDFUNC(GLOBAL(ashiftrt_r4_12)) + ENDFUNC(GLOBAL(ashiftrt_r4_13)) + ENDFUNC(GLOBAL(ashiftrt_r4_14)) + ENDFUNC(GLOBAL(ashiftrt_r4_15)) + ENDFUNC(GLOBAL(ashiftrt_r4_16)) + ENDFUNC(GLOBAL(ashiftrt_r4_17)) + ENDFUNC(GLOBAL(ashiftrt_r4_18)) + ENDFUNC(GLOBAL(ashiftrt_r4_19)) + ENDFUNC(GLOBAL(ashiftrt_r4_20)) + ENDFUNC(GLOBAL(ashiftrt_r4_21)) + ENDFUNC(GLOBAL(ashiftrt_r4_22)) + ENDFUNC(GLOBAL(ashiftrt_r4_23)) + ENDFUNC(GLOBAL(ashiftrt_r4_24)) + ENDFUNC(GLOBAL(ashiftrt_r4_25)) + ENDFUNC(GLOBAL(ashiftrt_r4_26)) + ENDFUNC(GLOBAL(ashiftrt_r4_27)) + ENDFUNC(GLOBAL(ashiftrt_r4_28)) + ENDFUNC(GLOBAL(ashiftrt_r4_29)) + ENDFUNC(GLOBAL(ashiftrt_r4_30)) + ENDFUNC(GLOBAL(ashiftrt_r4_31)) + ENDFUNC(GLOBAL(ashiftrt_r4_32)) +#endif + +#ifdef L_ashiftrt_n + +! +! GLOBAL(ashrsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! + + .global GLOBAL(ashrsi3) + HIDDEN_FUNC(GLOBAL(ashrsi3)) + .align 2 +GLOBAL(ashrsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(ashrsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(ashrsi3_table): + .byte LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table) + +LOCAL(ashrsi3_31): + rotcl r0 + rts + subc r0,r0 + +LOCAL(ashrsi3_30): + shar r0 +LOCAL(ashrsi3_29): + shar r0 +LOCAL(ashrsi3_28): + shar r0 +LOCAL(ashrsi3_27): + shar r0 +LOCAL(ashrsi3_26): + shar r0 +LOCAL(ashrsi3_25): + shar r0 +LOCAL(ashrsi3_24): + shlr16 r0 + shlr8 r0 + rts + exts.b r0,r0 + +LOCAL(ashrsi3_23): + shar r0 +LOCAL(ashrsi3_22): + shar r0 +LOCAL(ashrsi3_21): + shar r0 +LOCAL(ashrsi3_20): + shar r0 +LOCAL(ashrsi3_19): + shar r0 
+LOCAL(ashrsi3_18): + shar r0 +LOCAL(ashrsi3_17): + shar r0 +LOCAL(ashrsi3_16): + shlr16 r0 + rts + exts.w r0,r0 + +LOCAL(ashrsi3_15): + shar r0 +LOCAL(ashrsi3_14): + shar r0 +LOCAL(ashrsi3_13): + shar r0 +LOCAL(ashrsi3_12): + shar r0 +LOCAL(ashrsi3_11): + shar r0 +LOCAL(ashrsi3_10): + shar r0 +LOCAL(ashrsi3_9): + shar r0 +LOCAL(ashrsi3_8): + shar r0 +LOCAL(ashrsi3_7): + shar r0 +LOCAL(ashrsi3_6): + shar r0 +LOCAL(ashrsi3_5): + shar r0 +LOCAL(ashrsi3_4): + shar r0 +LOCAL(ashrsi3_3): + shar r0 +LOCAL(ashrsi3_2): + shar r0 +LOCAL(ashrsi3_1): + rts + shar r0 + +LOCAL(ashrsi3_0): + rts + nop + + ENDFUNC(GLOBAL(ashrsi3)) +#endif + +#ifdef L_ashiftlt + +! +! GLOBAL(ashlsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! + .global GLOBAL(ashlsi3) + HIDDEN_FUNC(GLOBAL(ashlsi3)) + .align 2 +GLOBAL(ashlsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(ashlsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(ashlsi3_table): + .byte LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table) + +LOCAL(ashlsi3_6): + shll2 r0 +LOCAL(ashlsi3_4): + shll2 r0 +LOCAL(ashlsi3_2): + rts + shll2 r0 + +LOCAL(ashlsi3_7): + shll2 r0 +LOCAL(ashlsi3_5): + shll2 r0 +LOCAL(ashlsi3_3): + shll2 r0 +LOCAL(ashlsi3_1): + rts + shll r0 + +LOCAL(ashlsi3_14): + shll2 r0 +LOCAL(ashlsi3_12): + shll2 r0 +LOCAL(ashlsi3_10): + shll2 r0 +LOCAL(ashlsi3_8): + rts + shll8 r0 + +LOCAL(ashlsi3_15): + shll2 r0 +LOCAL(ashlsi3_13): + shll2 r0 +LOCAL(ashlsi3_11): + shll2 r0 +LOCAL(ashlsi3_9): + shll8 r0 + rts + shll r0 + +LOCAL(ashlsi3_22): + shll2 r0 +LOCAL(ashlsi3_20): + shll2 r0 +LOCAL(ashlsi3_18): + shll2 r0 +LOCAL(ashlsi3_16): + rts + shll16 r0 + +LOCAL(ashlsi3_23): + shll2 r0 +LOCAL(ashlsi3_21): + shll2 r0 +LOCAL(ashlsi3_19): + shll2 r0 +LOCAL(ashlsi3_17): + shll16 r0 + rts + shll r0 + +LOCAL(ashlsi3_30): + shll2 r0 +LOCAL(ashlsi3_28): + shll2 r0 +LOCAL(ashlsi3_26): + shll2 r0 +LOCAL(ashlsi3_24): + shll16 r0 + rts + shll8 r0 + +LOCAL(ashlsi3_31): + shll2 r0 +LOCAL(ashlsi3_29): + shll2 r0 
+LOCAL(ashlsi3_27): + shll2 r0 +LOCAL(ashlsi3_25): + shll16 r0 + shll8 r0 + rts + shll r0 + +LOCAL(ashlsi3_0): + rts + nop + + ENDFUNC(GLOBAL(ashlsi3)) +#endif + +#ifdef L_lshiftrt + +! +! GLOBAL(lshrsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! + .global GLOBAL(lshrsi3) + HIDDEN_FUNC(GLOBAL(lshrsi3)) + .align 2 +GLOBAL(lshrsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(lshrsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(lshrsi3_table): + .byte LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table) + +LOCAL(lshrsi3_6): + shlr2 r0 +LOCAL(lshrsi3_4): + shlr2 r0 +LOCAL(lshrsi3_2): + rts + shlr2 r0 + +LOCAL(lshrsi3_7): + shlr2 r0 +LOCAL(lshrsi3_5): + shlr2 r0 +LOCAL(lshrsi3_3): + shlr2 r0 +LOCAL(lshrsi3_1): + rts + shlr r0 + +LOCAL(lshrsi3_14): + shlr2 r0 +LOCAL(lshrsi3_12): + shlr2 r0 +LOCAL(lshrsi3_10): + shlr2 r0 +LOCAL(lshrsi3_8): + rts + shlr8 r0 + +LOCAL(lshrsi3_15): + shlr2 r0 +LOCAL(lshrsi3_13): + shlr2 r0 +LOCAL(lshrsi3_11): + shlr2 r0 +LOCAL(lshrsi3_9): + shlr8 r0 + rts + shlr r0 + +LOCAL(lshrsi3_22): + shlr2 r0 +LOCAL(lshrsi3_20): + shlr2 r0 +LOCAL(lshrsi3_18): + shlr2 r0 +LOCAL(lshrsi3_16): + rts + shlr16 r0 + +LOCAL(lshrsi3_23): + shlr2 r0 +LOCAL(lshrsi3_21): + shlr2 r0 +LOCAL(lshrsi3_19): + shlr2 r0 +LOCAL(lshrsi3_17): + shlr16 r0 + rts + shlr r0 + +LOCAL(lshrsi3_30): + shlr2 r0 +LOCAL(lshrsi3_28): + shlr2 r0 +LOCAL(lshrsi3_26): + shlr2 r0 +LOCAL(lshrsi3_24): + shlr16 r0 + rts + shlr8 r0 + +LOCAL(lshrsi3_31): + shlr2 r0 +LOCAL(lshrsi3_29): + shlr2 r0 +LOCAL(lshrsi3_27): + shlr2 r0 +LOCAL(lshrsi3_25): + shlr16 r0 + shlr8 r0 + rts + shlr r0 + +LOCAL(lshrsi3_0): + rts + nop + + ENDFUNC(GLOBAL(lshrsi3)) +#endif + +#ifdef L_movmem + .text + .balign 4 + .global GLOBAL(movmem) + HIDDEN_FUNC(GLOBAL(movmem)) + HIDDEN_ALIAS(movstr,movmem) + /* This would be a lot simpler if r6 contained the byte count + minus 64, and we wouldn't be called here for a byte count of 64. 
*/ +GLOBAL(movmem): + sts.l pr,@-r15 + shll2 r6 + bsr GLOBAL(movmemSI52+2) + mov.l @(48,r5),r0 + .balign 4 +LOCAL(movmem_loop): /* Reached with rts */ + mov.l @(60,r5),r0 + add #-64,r6 + mov.l r0,@(60,r4) + tst r6,r6 + mov.l @(56,r5),r0 + bt LOCAL(movmem_done) + mov.l r0,@(56,r4) + cmp/pl r6 + mov.l @(52,r5),r0 + add #64,r5 + mov.l r0,@(52,r4) + add #64,r4 + bt GLOBAL(movmemSI52) +! done all the large groups, do the remainder +! jump to movmem+ + mova GLOBAL(movmemSI4)+4,r0 + add r6,r0 + jmp @r0 +LOCAL(movmem_done): ! share slot insn, works out aligned. + lds.l @r15+,pr + mov.l r0,@(56,r4) + mov.l @(52,r5),r0 + rts + mov.l r0,@(52,r4) + .balign 4 +! ??? We need aliases movstr* for movmem* for the older libraries. These +! aliases will be removed at the some point in the future. + .global GLOBAL(movmemSI64) + HIDDEN_FUNC(GLOBAL(movmemSI64)) + HIDDEN_ALIAS(movstrSI64,movmemSI64) +GLOBAL(movmemSI64): + mov.l @(60,r5),r0 + mov.l r0,@(60,r4) + .global GLOBAL(movmemSI60) + HIDDEN_FUNC(GLOBAL(movmemSI60)) + HIDDEN_ALIAS(movstrSI60,movmemSI60) +GLOBAL(movmemSI60): + mov.l @(56,r5),r0 + mov.l r0,@(56,r4) + .global GLOBAL(movmemSI56) + HIDDEN_FUNC(GLOBAL(movmemSI56)) + HIDDEN_ALIAS(movstrSI56,movmemSI56) +GLOBAL(movmemSI56): + mov.l @(52,r5),r0 + mov.l r0,@(52,r4) + .global GLOBAL(movmemSI52) + HIDDEN_FUNC(GLOBAL(movmemSI52)) + HIDDEN_ALIAS(movstrSI52,movmemSI52) +GLOBAL(movmemSI52): + mov.l @(48,r5),r0 + mov.l r0,@(48,r4) + .global GLOBAL(movmemSI48) + HIDDEN_FUNC(GLOBAL(movmemSI48)) + HIDDEN_ALIAS(movstrSI48,movmemSI48) +GLOBAL(movmemSI48): + mov.l @(44,r5),r0 + mov.l r0,@(44,r4) + .global GLOBAL(movmemSI44) + HIDDEN_FUNC(GLOBAL(movmemSI44)) + HIDDEN_ALIAS(movstrSI44,movmemSI44) +GLOBAL(movmemSI44): + mov.l @(40,r5),r0 + mov.l r0,@(40,r4) + .global GLOBAL(movmemSI40) + HIDDEN_FUNC(GLOBAL(movmemSI40)) + HIDDEN_ALIAS(movstrSI40,movmemSI40) +GLOBAL(movmemSI40): + mov.l @(36,r5),r0 + mov.l r0,@(36,r4) + .global GLOBAL(movmemSI36) + HIDDEN_FUNC(GLOBAL(movmemSI36)) + HIDDEN_ALIAS(movstrSI36,movmemSI36) +GLOBAL(movmemSI36): + mov.l @(32,r5),r0 + mov.l r0,@(32,r4) + .global GLOBAL(movmemSI32) + HIDDEN_FUNC(GLOBAL(movmemSI32)) + HIDDEN_ALIAS(movstrSI32,movmemSI32) +GLOBAL(movmemSI32): + mov.l @(28,r5),r0 + mov.l r0,@(28,r4) + .global GLOBAL(movmemSI28) + HIDDEN_FUNC(GLOBAL(movmemSI28)) + HIDDEN_ALIAS(movstrSI28,movmemSI28) +GLOBAL(movmemSI28): + mov.l @(24,r5),r0 + mov.l r0,@(24,r4) + .global GLOBAL(movmemSI24) + HIDDEN_FUNC(GLOBAL(movmemSI24)) + HIDDEN_ALIAS(movstrSI24,movmemSI24) +GLOBAL(movmemSI24): + mov.l @(20,r5),r0 + mov.l r0,@(20,r4) + .global GLOBAL(movmemSI20) + HIDDEN_FUNC(GLOBAL(movmemSI20)) + HIDDEN_ALIAS(movstrSI20,movmemSI20) +GLOBAL(movmemSI20): + mov.l @(16,r5),r0 + mov.l r0,@(16,r4) + .global GLOBAL(movmemSI16) + HIDDEN_FUNC(GLOBAL(movmemSI16)) + HIDDEN_ALIAS(movstrSI16,movmemSI16) +GLOBAL(movmemSI16): + mov.l @(12,r5),r0 + mov.l r0,@(12,r4) + .global GLOBAL(movmemSI12) + HIDDEN_FUNC(GLOBAL(movmemSI12)) + HIDDEN_ALIAS(movstrSI12,movmemSI12) +GLOBAL(movmemSI12): + mov.l @(8,r5),r0 + mov.l r0,@(8,r4) + .global GLOBAL(movmemSI8) + HIDDEN_FUNC(GLOBAL(movmemSI8)) + HIDDEN_ALIAS(movstrSI8,movmemSI8) +GLOBAL(movmemSI8): + mov.l @(4,r5),r0 + mov.l r0,@(4,r4) + .global GLOBAL(movmemSI4) + HIDDEN_FUNC(GLOBAL(movmemSI4)) + HIDDEN_ALIAS(movstrSI4,movmemSI4) +GLOBAL(movmemSI4): + mov.l @(0,r5),r0 + rts + mov.l r0,@(0,r4) + + ENDFUNC(GLOBAL(movmemSI64)) + ENDFUNC(GLOBAL(movmemSI60)) + ENDFUNC(GLOBAL(movmemSI56)) + ENDFUNC(GLOBAL(movmemSI52)) + ENDFUNC(GLOBAL(movmemSI48)) + ENDFUNC(GLOBAL(movmemSI44)) + 
ENDFUNC(GLOBAL(movmemSI40)) + ENDFUNC(GLOBAL(movmemSI36)) + ENDFUNC(GLOBAL(movmemSI32)) + ENDFUNC(GLOBAL(movmemSI28)) + ENDFUNC(GLOBAL(movmemSI24)) + ENDFUNC(GLOBAL(movmemSI20)) + ENDFUNC(GLOBAL(movmemSI16)) + ENDFUNC(GLOBAL(movmemSI12)) + ENDFUNC(GLOBAL(movmemSI8)) + ENDFUNC(GLOBAL(movmemSI4)) + ENDFUNC(GLOBAL(movmem)) +#endif + +#ifdef L_movmem_i4 + .text + .global GLOBAL(movmem_i4_even) + .global GLOBAL(movmem_i4_odd) + .global GLOBAL(movmemSI12_i4) + + HIDDEN_FUNC(GLOBAL(movmem_i4_even)) + HIDDEN_FUNC(GLOBAL(movmem_i4_odd)) + HIDDEN_FUNC(GLOBAL(movmemSI12_i4)) + + HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even) + HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd) + HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4) + + .p2align 5 +L_movmem_2mod4_end: + mov.l r0,@(16,r4) + rts + mov.l r1,@(20,r4) + + .p2align 2 + +GLOBAL(movmem_i4_even): + mov.l @r5+,r0 + bra L_movmem_start_even + mov.l @r5+,r1 + +GLOBAL(movmem_i4_odd): + mov.l @r5+,r1 + add #-4,r4 + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r1,@(4,r4) + mov.l r2,@(8,r4) + +L_movmem_loop: + mov.l r3,@(12,r4) + dt r6 + mov.l @r5+,r0 + bt/s L_movmem_2mod4_end + mov.l @r5+,r1 + add #16,r4 +L_movmem_start_even: + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r0,@r4 + dt r6 + mov.l r1,@(4,r4) + bf/s L_movmem_loop + mov.l r2,@(8,r4) + rts + mov.l r3,@(12,r4) + + ENDFUNC(GLOBAL(movmem_i4_even)) + ENDFUNC(GLOBAL(movmem_i4_odd)) + + .p2align 4 +GLOBAL(movmemSI12_i4): + mov.l @r5,r0 + mov.l @(4,r5),r1 + mov.l @(8,r5),r2 + mov.l r0,@r4 + mov.l r1,@(4,r4) + rts + mov.l r2,@(8,r4) + + ENDFUNC(GLOBAL(movmemSI12_i4)) +#endif + +#ifdef L_mulsi3 + + + .global GLOBAL(mulsi3) + HIDDEN_FUNC(GLOBAL(mulsi3)) + +! r4 = aabb +! r5 = ccdd +! r0 = aabb*ccdd via partial products +! +! if aa == 0 and cc = 0 +! r0 = bb*dd +! +! else +! aa = bb*dd + (aa*dd*65536) + (cc*bb*65536) +! + +GLOBAL(mulsi3): + mulu.w r4,r5 ! multiply the lsws macl=bb*dd + mov r5,r3 ! r3 = ccdd + swap.w r4,r2 ! r2 = bbaa + xtrct r2,r3 ! r3 = aacc + tst r3,r3 ! msws zero ? + bf hiset + rts ! yes - then we have the answer + sts macl,r0 + +hiset: sts macl,r0 ! r0 = bb*dd + mulu.w r2,r5 ! brewing macl = aa*dd + sts macl,r1 + mulu.w r3,r4 ! brewing macl = cc*bb + sts macl,r2 + add r1,r2 + shll16 r2 + rts + add r2,r0 + + ENDFUNC(GLOBAL(mulsi3)) +#endif +#endif /* ! __SH5__ */ +#ifdef L_sdivsi3_i4 + .title "SH DIVIDE" +!! 4 byte integer Divide code for the Renesas SH +#ifdef __SH4__ +!! args in r4 and r5, result in fpul, clobber dr0, dr2 + + .global GLOBAL(sdivsi3_i4) + HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) +GLOBAL(sdivsi3_i4): + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + + ENDFUNC(GLOBAL(sdivsi3_i4)) +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) +!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 + +#if ! __SH5__ || __SH5__ == 32 +#if __SH5__ + .mode SHcompact +#endif + .global GLOBAL(sdivsi3_i4) + HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) +GLOBAL(sdivsi3_i4): + sts.l fpscr,@-r15 + mov #8,r2 + swap.w r2,r2 + lds r2,fpscr + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + + ENDFUNC(GLOBAL(sdivsi3_i4)) +#endif /* ! __SH5__ || __SH5__ == 32 */ +#endif /* ! __SH4__ */ +#endif + +#ifdef L_sdivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh2e/sh3e code. */ +#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__) +!! +!! Steve Chamberlain +!! sac@cygnus.com +!! +!! + +!! 
args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit + + .global GLOBAL(sdivsi3) +#if __SHMEDIA__ +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 2 +#if 0 +/* The assembly code that follows is a hand-optimized version of the C + code that follows. Note that the registers that are modified are + exactly those listed as clobbered in the patterns divsi3_i1 and + divsi3_i1_media. + +int __sdivsi3 (i, j) + int i, j; +{ + register unsigned long long r18 asm ("r18"); + register unsigned long long r19 asm ("r19"); + register unsigned long long r0 asm ("r0") = 0; + register unsigned long long r1 asm ("r1") = 1; + register int r2 asm ("r2") = i >> 31; + register int r3 asm ("r3") = j >> 31; + + r2 = r2 ? r2 : r1; + r3 = r3 ? r3 : r1; + r18 = i * r2; + r19 = j * r3; + r2 *= r3; + + r19 <<= 31; + r1 <<= 31; + do + if (r18 >= r19) + r0 |= r1, r18 -= r19; + while (r19 >>= 1, r1 >>= 1); + + return r2 * (int)r0; +} +*/ +GLOBAL(sdivsi3): + pt/l LOCAL(sdivsi3_dontadd), tr2 + pt/l LOCAL(sdivsi3_loop), tr1 + ptabs/l r18, tr0 + movi 0, r0 + movi 1, r1 + shari.l r4, 31, r2 + shari.l r5, 31, r3 + cmveq r2, r1, r2 + cmveq r3, r1, r3 + muls.l r4, r2, r18 + muls.l r5, r3, r19 + muls.l r2, r3, r2 + shlli r19, 31, r19 + shlli r1, 31, r1 +LOCAL(sdivsi3_loop): + bgtu r19, r18, tr2 + or r0, r1, r0 + sub r18, r19, r18 +LOCAL(sdivsi3_dontadd): + shlri r1, 1, r1 + shlri r19, 1, r19 + bnei r1, 0, tr1 + muls.l r0, r2, r0 + add.l r0, r63, r0 + blink tr0, r63 +#elif 0 /* ! 0 */ + // inputs: r4,r5 + // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0 + // result in r0 +GLOBAL(sdivsi3): + // can create absolute value without extra latency, + // but dependent on proper sign extension of inputs: + // shari.l r5,31,r2 + // xor r5,r2,r20 + // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended. + shari.l r5,31,r2 + ori r2,1,r2 + muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended. + movi 0xffffffffffffbb0c,r19 // shift count eqiv 76 + shari.l r4,31,r3 + nsb r20,r0 + shlld r20,r0,r25 + shlri r25,48,r25 + sub r19,r25,r1 + mmulfx.w r1,r1,r2 + mshflo.w r1,r63,r1 + // If r4 was to be used in-place instead of r21, could use this sequence + // to compute absolute: + // sub r63,r4,r19 // compute absolute value of r4 + // shlri r4,32,r3 // into lower 32 bit of r4, keeping + // mcmv r19,r3,r4 // the sign in the upper 32 bits intact. + ori r3,1,r3 + mmulfx.w r25,r2,r2 + sub r19,r0,r0 + muls.l r4,r3,r21 + msub.w r1,r2,r2 + addi r2,-2,r1 + mulu.l r21,r1,r19 + mmulfx.w r2,r2,r2 + shlli r1,15,r1 + shlrd r19,r0,r19 + mulu.l r19,r20,r3 + mmacnfx.wl r25,r2,r1 + ptabs r18,tr0 + sub r21,r3,r25 + + mulu.l r25,r1,r2 + addi r0,14,r0 + xor r4,r5,r18 + shlrd r2,r0,r2 + mulu.l r2,r20,r3 + add r19,r2,r19 + shari.l r18,31,r18 + sub r25,r3,r25 + + mulu.l r25,r1,r2 + sub r25,r20,r25 + add r19,r18,r19 + shlrd r2,r0,r2 + mulu.l r2,r20,r3 + addi r25,1,r25 + add r19,r2,r19 + + cmpgt r25,r3,r25 + add.l r19,r25,r0 + xor r0,r18,r0 + blink tr0,r63 +#else /* ! 0 && ! 0 */ + + // inputs: r4,r5 + // clobbered: r1,r18,r19,r20,r21,r25,tr0 + // result in r0 + HIDDEN_FUNC(GLOBAL(sdivsi3_2)) +#ifndef __pic__ + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): /* this is the shcompact entry point */ + // The special SHmedia entry point sdivsi3_1 prevents accidental linking + // with the SHcompact implementation, which clobbers tr1 / tr2. 
+ .global GLOBAL(sdivsi3_1) +GLOBAL(sdivsi3_1): + .global GLOBAL(div_table_internal) + movi (GLOBAL(div_table_internal) >> 16) & 65535, r20 + shori GLOBAL(div_table_internal) & 65535, r20 +#endif + .global GLOBAL(sdivsi3_2) + // div_table in r20 + // clobbered: r1,r18,r19,r21,r25,tr0 +GLOBAL(sdivsi3_2): + nsb r5, r1 + shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62 + shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1) + ldx.ub r20, r21, r19 // u0.8 + shari r25, 32, r25 // normalize to s2.30 + shlli r21, 1, r21 + muls.l r25, r19, r19 // s2.38 + ldx.w r20, r21, r21 // s2.14 + ptabs r18, tr0 + shari r19, 24, r19 // truncate to s2.14 + sub r21, r19, r19 // some 11 bit inverse in s1.14 + muls.l r19, r19, r21 // u0.28 + sub r63, r1, r1 + addi r1, 92, r1 + muls.l r25, r21, r18 // s2.58 + shlli r19, 45, r19 // multiply by two and convert to s2.58 + /* bubble */ + sub r19, r18, r18 + shari r18, 28, r18 // some 22 bit inverse in s1.30 + muls.l r18, r25, r0 // s2.60 + muls.l r18, r4, r25 // s32.30 + /* bubble */ + shari r0, 16, r19 // s-16.44 + muls.l r19, r18, r19 // s-16.74 + shari r25, 63, r0 + shari r4, 14, r18 // s19.-14 + shari r19, 30, r19 // s-16.44 + muls.l r19, r18, r19 // s15.30 + xor r21, r0, r21 // You could also use the constant 1 << 27. + add r21, r25, r21 + sub r21, r19, r21 + shard r21, r1, r21 + sub r21, r0, r0 + blink tr0, r63 +#ifndef __pic__ + ENDFUNC(GLOBAL(sdivsi3)) +#endif + ENDFUNC(GLOBAL(sdivsi3_2)) +#endif +#elif defined __SHMEDIA__ +/* m5compact-nofpu */ + // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): + pt/l LOCAL(sdivsi3_dontsub), tr0 + pt/l LOCAL(sdivsi3_loop), tr1 + ptabs/l r18,tr2 + shari.l r4,31,r18 + shari.l r5,31,r19 + xor r4,r18,r20 + xor r5,r19,r21 + sub.l r20,r18,r20 + sub.l r21,r19,r21 + xor r18,r19,r19 + shlli r21,32,r25 + addi r25,-1,r21 + addz.l r20,r63,r20 +LOCAL(sdivsi3_loop): + shlli r20,1,r20 + bgeu/u r21,r20,tr0 + sub r20,r21,r20 +LOCAL(sdivsi3_dontsub): + addi.l r25,-1,r25 + bnei r25,-32,tr1 + xor r20,r19,r20 + sub.l r20,r19,r0 + blink tr2,r63 + ENDFUNC(GLOBAL(sdivsi3)) +#else /* ! __SHMEDIA__ */ + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): + mov r4,r1 + mov r5,r0 + + tst r0,r0 + bt div0 + mov #0,r2 + div0s r2,r1 + subc r3,r3 + subc r2,r1 + div0s r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + addc r2,r1 + rts + mov r1,r0 + + +div0: rts + mov #0,r0 + + ENDFUNC(GLOBAL(sdivsi3)) +#endif /* ! __SHMEDIA__ */ +#endif /* ! __SH4__ */ +#endif +#ifdef L_udivsi3_i4 + + .title "SH DIVIDE" +!! 4 byte integer Divide code for the Renesas SH +#ifdef __SH4__ +!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4, +!! 
and t bit + + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + mov #1,r1 + cmp/hi r1,r5 + bf trivial + rotr r1 + xor r1,r4 + lds r4,fpul + mova L1,r0 +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else + fmov.s @r0+,DR40 + fmov.s @r0,DR41 +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + +trivial: + rts + lds r4,fpul + + .align 2 +#ifdef FMOVD_WORKS + .align 3 ! make double below 8 byte aligned. +#endif +L1: + .double 2147483648 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) +#if ! __SH5__ || __SH5__ == 32 +!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 + .mode SHmedia + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + addz.l r4,r63,r20 + addz.l r5,r63,r21 + fmov.qd r20,dr0 + fmov.qd r21,dr32 + ptabs r18,tr0 + float.qd dr0,dr0 + float.qd dr32,dr32 + fdiv.d dr0,dr32,dr0 + ftrc.dq dr0,dr32 + fmov.s fr33,fr32 + blink tr0,r63 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#endif /* ! __SH5__ || __SH5__ == 32 */ +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) +!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 + + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + mov #1,r1 + cmp/hi r1,r5 + bf trivial + sts.l fpscr,@-r15 + mova L1,r0 + lds.l @r0+,fpscr + rotr r1 + xor r1,r4 + lds r4,fpul +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else + fmov.s @r0+,DR40 + fmov.s @r0,DR41 +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + +#ifdef FMOVD_WORKS + .align 3 ! make double below 8 byte aligned. +#endif +trivial: + rts + lds r4,fpul + + .align 2 +L1: +#ifndef FMOVD_WORKS + .long 0x80000 +#else + .long 0x180000 +#endif + .double 2147483648 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#endif /* ! __SH4__ */ +#endif + +#ifdef L_udivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh2e/sh3e code. */ +#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__) + +!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit + .global GLOBAL(udivsi3) + HIDDEN_FUNC(GLOBAL(udivsi3)) + +#if __SHMEDIA__ +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 2 +#if 0 +/* The assembly code that follows is a hand-optimized version of the C + code that follows. Note that the registers that are modified are + exactly those listed as clobbered in the patterns udivsi3_i1 and + udivsi3_i1_media. + +unsigned +__udivsi3 (i, j) + unsigned i, j; +{ + register unsigned long long r0 asm ("r0") = 0; + register unsigned long long r18 asm ("r18") = 1; + register unsigned long long r4 asm ("r4") = i; + register unsigned long long r19 asm ("r19") = j; + + r19 <<= 31; + r18 <<= 31; + do + if (r4 >= r19) + r0 |= r18, r4 -= r19; + while (r19 >>= 1, r18 >>= 1); + + return r0; +} +*/ +GLOBAL(udivsi3): + pt/l LOCAL(udivsi3_dontadd), tr2 + pt/l LOCAL(udivsi3_loop), tr1 + ptabs/l r18, tr0 + movi 0, r0 + movi 1, r18 + addz.l r5, r63, r19 + addz.l r4, r63, r4 + shlli r19, 31, r19 + shlli r18, 31, r18 +LOCAL(udivsi3_loop): + bgtu r19, r4, tr2 + or r0, r18, r0 + sub r4, r19, r4 +LOCAL(udivsi3_dontadd): + shlri r18, 1, r18 + shlri r19, 1, r19 + bnei r18, 0, tr1 + blink tr0, r63 +#else +GLOBAL(udivsi3): + // inputs: r4,r5 + // clobbered: r18,r19,r20,r21,r22,r25,tr0 + // result in r0. 
+ addz.l r5,r63,r22 + nsb r22,r0 + shlld r22,r0,r25 + shlri r25,48,r25 + movi 0xffffffffffffbb0c,r20 // shift count eqiv 76 + sub r20,r25,r21 + mmulfx.w r21,r21,r19 + mshflo.w r21,r63,r21 + ptabs r18,tr0 + mmulfx.w r25,r19,r19 + sub r20,r0,r0 + /* bubble */ + msub.w r21,r19,r19 + addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21 + before the msub.w, but we need a different value for + r19 to keep errors under control. */ + mulu.l r4,r21,r18 + mmulfx.w r19,r19,r19 + shlli r21,15,r21 + shlrd r18,r0,r18 + mulu.l r18,r22,r20 + mmacnfx.wl r25,r19,r21 + /* bubble */ + sub r4,r20,r25 + + mulu.l r25,r21,r19 + addi r0,14,r0 + /* bubble */ + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + add r18,r19,r18 + /* bubble */ + sub.l r25,r20,r25 + + mulu.l r25,r21,r19 + addz.l r25,r63,r25 + sub r25,r22,r25 + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + addi r25,1,r25 + add r18,r19,r18 + + cmpgt r25,r20,r25 + add.l r18,r25,r0 + blink tr0,r63 +#endif +#elif defined (__SHMEDIA__) +/* m5compact-nofpu - more emphasis on code size than on speed, but don't + ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4. + So use a short shmedia loop. */ + // clobbered: r20,r21,r25,tr0,tr1,tr2 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 +GLOBAL(udivsi3): + pt/l LOCAL(udivsi3_dontsub), tr0 + pt/l LOCAL(udivsi3_loop), tr1 + ptabs/l r18,tr2 + shlli r5,32,r25 + addi r25,-1,r21 + addz.l r4,r63,r20 +LOCAL(udivsi3_loop): + shlli r20,1,r20 + bgeu/u r21,r20,tr0 + sub r20,r21,r20 +LOCAL(udivsi3_dontsub): + addi.l r25,-1,r25 + bnei r25,-32,tr1 + add.l r20,r63,r0 + blink tr2,r63 +#else /* ! defined (__SHMEDIA__) */ +LOCAL(div8): + div1 r5,r4 +LOCAL(div7): + div1 r5,r4; div1 r5,r4; div1 r5,r4 + div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 + +LOCAL(divx4): + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + rts; div1 r5,r4 + +GLOBAL(udivsi3): + sts.l pr,@-r15 + extu.w r5,r0 + cmp/eq r5,r0 +#ifdef __sh1__ + bf LOCAL(large_divisor) +#else + bf/s LOCAL(large_divisor) +#endif + div0u + swap.w r4,r0 + shlr16 r4 + bsr LOCAL(div8) + shll16 r5 + bsr LOCAL(div7) + div1 r5,r4 + xtrct r4,r0 + xtrct r0,r4 + bsr LOCAL(div8) + swap.w r4,r4 + bsr LOCAL(div7) + div1 r5,r4 + lds.l @r15+,pr + xtrct r4,r0 + swap.w r0,r0 + rotcl r0 + rts + shlr16 r5 + +LOCAL(large_divisor): +#ifdef __sh1__ + div0u +#endif + mov #0,r0 + xtrct r4,r0 + xtrct r0,r4 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + lds.l @r15+,pr + rts + rotcl r0 + + ENDFUNC(GLOBAL(udivsi3)) +#endif /* ! __SHMEDIA__ */ +#endif /* __SH4__ */ +#endif /* L_udivsi3 */ + +#ifdef L_udivdi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(udivdi3) + FUNC(GLOBAL(udivdi3)) +GLOBAL(udivdi3): + HIDDEN_ALIAS(udivdi3_internal,udivdi3) + shlri r3,1,r4 + nsb r4,r22 + shlld r3,r22,r6 + shlri r6,49,r5 + movi 0xffffffffffffbaf1,r21 /* .l shift count 17. 
*/ + sub r21,r5,r1 + mmulfx.w r1,r1,r4 + mshflo.w r1,r63,r1 + sub r63,r22,r20 // r63 == 64 % 64 + mmulfx.w r5,r4,r4 + pta LOCAL(large_divisor),tr0 + addi r20,32,r9 + msub.w r1,r4,r1 + madd.w r1,r1,r1 + mmulfx.w r1,r1,r4 + shlri r6,32,r7 + bgt/u r9,r63,tr0 // large_divisor + mmulfx.w r5,r4,r4 + shlri r2,32+14,r19 + addi r22,-31,r0 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r19,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + mulu.l r5,r3,r8 + mshalds.l r1,r21,r1 + shari r4,26,r4 + shlld r8,r0,r8 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r2,r8,r2 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ + + shlri r2,22,r21 + mulu.l r21,r1,r21 + shlld r5,r0,r8 + addi r20,30-22,r0 + shlrd r21,r0,r21 + mulu.l r21,r3,r5 + add r8,r21,r8 + mcmpgt.l r21,r63,r21 // See Note 1 + addi r20,30,r0 + mshfhi.l r63,r21,r21 + sub r2,r5,r2 + andc r2,r21,r2 + + /* small divisor: need a third divide step */ + mulu.l r2,r1,r7 + ptabs r18,tr0 + addi r2,1,r2 + shlrd r7,r0,r7 + mulu.l r7,r3,r5 + add r8,r7,r8 + sub r2,r3,r2 + cmpgt r2,r5,r5 + add r8,r5,r2 + /* could test r3 here to check for divide by zero. */ + blink tr0,r63 + +LOCAL(large_divisor): + mmulfx.w r5,r4,r4 + shlrd r2,r9,r25 + shlri r25,32,r8 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r8,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + shlri r5,14-1,r8 + mulu.l r8,r7,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r25,r5,r25 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ + + shlri r25,22,r21 + mulu.l r21,r1,r21 + pta LOCAL(no_lo_adj),tr0 + addi r22,32,r0 + shlri r21,40,r21 + mulu.l r21,r7,r5 + add r8,r21,r8 + shlld r2,r0,r2 + sub r25,r5,r25 + bgtu/u r7,r25,tr0 // no_lo_adj + addi r8,1,r8 + sub r25,r7,r25 +LOCAL(no_lo_adj): + mextr4 r2,r25,r2 + + /* large_divisor: only needs a few adjustments. */ + mulu.l r8,r6,r5 + ptabs r18,tr0 + /* bubble */ + cmpgtu r5,r2,r5 + sub r8,r5,r2 + blink tr0,r63 + ENDFUNC(GLOBAL(udivdi3)) +/* Note 1: To shift the result of the second divide stage so that the result + always fits into 32 bits, yet we still reduce the rest sufficiently + would require a lot of instructions to do the shifts just right. Using + the full 64 bit shift result to multiply with the divisor would require + four extra instructions for the upper 32 bits (shift / mulu / shift / sub). + Fortunately, if the upper 32 bits of the shift result are nonzero, we + know that the rest after taking this partial result into account will + fit into 32 bits. So we just clear the upper 32 bits of the rest if the + upper 32 bits of the partial result are nonzero. 
*/ +#endif /* __SHMEDIA__ */ +#endif /* L_udivdi3 */ + +#ifdef L_divdi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(divdi3) + FUNC(GLOBAL(divdi3)) +GLOBAL(divdi3): + pta GLOBAL(udivdi3_internal),tr0 + shari r2,63,r22 + shari r3,63,r23 + xor r2,r22,r2 + xor r3,r23,r3 + sub r2,r22,r2 + sub r3,r23,r3 + beq/u r22,r23,tr0 + ptabs r18,tr1 + blink tr0,r18 + sub r63,r2,r2 + blink tr1,r63 + ENDFUNC(GLOBAL(divdi3)) +#endif /* __SHMEDIA__ */ +#endif /* L_divdi3 */ + +#ifdef L_umoddi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(umoddi3) + FUNC(GLOBAL(umoddi3)) +GLOBAL(umoddi3): + HIDDEN_ALIAS(umoddi3_internal,umoddi3) + shlri r3,1,r4 + nsb r4,r22 + shlld r3,r22,r6 + shlri r6,49,r5 + movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ + sub r21,r5,r1 + mmulfx.w r1,r1,r4 + mshflo.w r1,r63,r1 + sub r63,r22,r20 // r63 == 64 % 64 + mmulfx.w r5,r4,r4 + pta LOCAL(large_divisor),tr0 + addi r20,32,r9 + msub.w r1,r4,r1 + madd.w r1,r1,r1 + mmulfx.w r1,r1,r4 + shlri r6,32,r7 + bgt/u r9,r63,tr0 // large_divisor + mmulfx.w r5,r4,r4 + shlri r2,32+14,r19 + addi r22,-31,r0 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r19,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + mulu.l r5,r3,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + shlld r5,r0,r5 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r2,r5,r2 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ + + shlri r2,22,r21 + mulu.l r21,r1,r21 + addi r20,30-22,r0 + /* bubble */ /* could test r3 here to check for divide by zero. */ + shlrd r21,r0,r21 + mulu.l r21,r3,r5 + mcmpgt.l r21,r63,r21 // See Note 1 + addi r20,30,r0 + mshfhi.l r63,r21,r21 + sub r2,r5,r2 + andc r2,r21,r2 + + /* small divisor: need a third divide step */ + mulu.l r2,r1,r7 + ptabs r18,tr0 + sub r2,r3,r8 /* re-use r8 here for rest - r3 */ + shlrd r7,r0,r7 + mulu.l r7,r3,r5 + /* bubble */ + addi r8,1,r7 + cmpgt r7,r5,r7 + cmvne r7,r8,r2 + sub r2,r5,r2 + blink tr0,r63 + +LOCAL(large_divisor): + mmulfx.w r5,r4,r4 + shlrd r2,r9,r25 + shlri r25,32,r8 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r8,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + shlri r5,14-1,r8 + mulu.l r8,r7,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r25,r5,r25 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ + + shlri r25,22,r21 + mulu.l r21,r1,r21 + pta LOCAL(no_lo_adj),tr0 + addi r22,32,r0 + shlri r21,40,r21 + mulu.l r21,r7,r5 + add r8,r21,r8 + shlld r2,r0,r2 + sub r25,r5,r25 + bgtu/u r7,r25,tr0 // no_lo_adj + addi r8,1,r8 + sub r25,r7,r25 +LOCAL(no_lo_adj): + mextr4 r2,r25,r2 + + /* large_divisor: only needs a few adjustments. 
*/ + mulu.l r8,r6,r5 + ptabs r18,tr0 + add r2,r6,r7 + cmpgtu r5,r2,r8 + cmvne r8,r7,r2 + sub r2,r5,r2 + shlrd r2,r22,r2 + blink tr0,r63 + ENDFUNC(GLOBAL(umoddi3)) +/* Note 1: To shift the result of the second divide stage so that the result + always fits into 32 bits, yet we still reduce the rest sufficiently + would require a lot of instructions to do the shifts just right. Using + the full 64 bit shift result to multiply with the divisor would require + four extra instructions for the upper 32 bits (shift / mulu / shift / sub). + Fortunately, if the upper 32 bits of the shift result are nonzero, we + know that the rest after taking this partial result into account will + fit into 32 bits. So we just clear the upper 32 bits of the rest if the + upper 32 bits of the partial result are nonzero. */ +#endif /* __SHMEDIA__ */ +#endif /* L_umoddi3 */ + +#ifdef L_moddi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(moddi3) + FUNC(GLOBAL(moddi3)) +GLOBAL(moddi3): + pta GLOBAL(umoddi3_internal),tr0 + shari r2,63,r22 + shari r3,63,r23 + xor r2,r22,r2 + xor r3,r23,r3 + sub r2,r22,r2 + sub r3,r23,r3 + beq/u r22,r63,tr0 + ptabs r18,tr1 + blink tr0,r18 + sub r63,r2,r2 + blink tr1,r63 + ENDFUNC(GLOBAL(moddi3)) +#endif /* __SHMEDIA__ */ +#endif /* L_moddi3 */ + +#ifdef L_set_fpscr +#if !defined (__SH2A_NOFPU__) +#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32 +#ifdef __SH5__ + .mode SHcompact +#endif + .global GLOBAL(set_fpscr) + HIDDEN_FUNC(GLOBAL(set_fpscr)) +GLOBAL(set_fpscr): + lds r4,fpscr +#ifdef __PIC__ + mov.l r12,@-r15 +#ifdef __vxworks + mov.l LOCAL(set_fpscr_L0_base),r12 + mov.l LOCAL(set_fpscr_L0_index),r0 + mov.l @r12,r12 + mov.l @(r0,r12),r12 +#else + mova LOCAL(set_fpscr_L0),r0 + mov.l LOCAL(set_fpscr_L0),r12 + add r0,r12 +#endif + mov.l LOCAL(set_fpscr_L1),r0 + mov.l @(r0,r12),r1 + mov.l @r15+,r12 +#else + mov.l LOCAL(set_fpscr_L1),r1 +#endif + swap.w r4,r0 + or #24,r0 +#ifndef FMOVD_WORKS + xor #16,r0 +#endif +#if defined(__SH4__) || defined (__SH2A_DOUBLE__) + swap.w r0,r3 + mov.l r3,@(4,r1) +#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */ + swap.w r0,r2 + mov.l r2,@r1 +#endif +#ifndef FMOVD_WORKS + xor #8,r0 +#else + xor #24,r0 +#endif +#if defined(__SH4__) || defined (__SH2A_DOUBLE__) + swap.w r0,r2 + rts + mov.l r2,@r1 +#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */ + swap.w r0,r3 + rts + mov.l r3,@(4,r1) +#endif + .align 2 +#ifdef __PIC__ +#ifdef __vxworks +LOCAL(set_fpscr_L0_base): + .long ___GOTT_BASE__ +LOCAL(set_fpscr_L0_index): + .long ___GOTT_INDEX__ +#else +LOCAL(set_fpscr_L0): + .long _GLOBAL_OFFSET_TABLE_ +#endif +LOCAL(set_fpscr_L1): + .long GLOBAL(fpscr_values@GOT) +#else +LOCAL(set_fpscr_L1): + .long GLOBAL(fpscr_values) +#endif + + ENDFUNC(GLOBAL(set_fpscr)) +#ifndef NO_FPSCR_VALUES +#ifdef __ELF__ + .comm GLOBAL(fpscr_values),8,4 +#else + .comm GLOBAL(fpscr_values),8 +#endif /* ELF */ +#endif /* NO_FPSCR_VALUES */ +#endif /* SH2E / SH3E / SH4 */ +#endif /* __SH2A_NOFPU__ */ +#endif /* L_set_fpscr */ +#ifdef L_ic_invalidate +#if __SH5__ == 32 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(init_trampoline) + HIDDEN_FUNC(GLOBAL(init_trampoline)) +GLOBAL(init_trampoline): + st.l r0,8,r2 +#ifdef __LITTLE_ENDIAN__ + movi 9,r20 + shori 0x402b,r20 + shori 0xd101,r20 + shori 0xd002,r20 +#else + movi 0xffffffffffffd002,r20 
+ shori 0xd101,r20 + shori 0x402b,r20 + shori 9,r20 +#endif + st.q r0,0,r20 + st.l r0,12,r3 + ENDFUNC(GLOBAL(init_trampoline)) + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): + ocbwb r0,0 + synco + icbi r0, 0 + ptabs r18, tr0 + synci + blink tr0, r63 + ENDFUNC(GLOBAL(ic_invalidate)) +#elif defined(__SH4A__) + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): + ocbwb @r4 + synco + icbi @r4 + rts + nop + ENDFUNC(GLOBAL(ic_invalidate)) +#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) + /* For system code, we use ic_invalidate_line_i, but user code + needs a different mechanism. A kernel call is generally not + available, and it would also be slow. Different SH4 variants use + different sizes and associativities of the Icache. We use a small + bit of dispatch code that can be put hidden in every shared object, + which calls the actual processor-specific invalidation code in a + separate module. + Or if you have operating system support, the OS could mmap the + procesor-specific code from a single page, since it is highly + repetitive. */ + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): +#ifdef __pic__ +#ifdef __vxworks + mov.l 1f,r1 + mov.l 2f,r0 + mov.l @r1,r1 + mov.l 0f,r2 + mov.l @(r0,r1),r0 +#else + mov.l 1f,r1 + mova 1f,r0 + mov.l 0f,r2 + add r1,r0 +#endif + mov.l @(r0,r2),r1 +#else + mov.l 0f,r1 +#endif + ocbwb @r4 + mov.l @(8,r1),r0 + sub r1,r4 + and r4,r0 + add r1,r0 + jmp @r0 + mov.l @(4,r1),r0 + .align 2 +#ifndef __pic__ +0: .long GLOBAL(ic_invalidate_array) +#else /* __pic__ */ + .global GLOBAL(ic_invalidate_array) +0: .long GLOBAL(ic_invalidate_array)@GOT +#ifdef __vxworks +1: .long ___GOTT_BASE__ +2: .long ___GOTT_INDEX__ +#else +1: .long _GLOBAL_OFFSET_TABLE_ +#endif + ENDFUNC(GLOBAL(ic_invalidate)) +#endif /* __pic__ */ +#endif /* SH4 */ +#endif /* L_ic_invalidate */ + +#ifdef L_ic_invalidate_array +#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)))) + .global GLOBAL(ic_invalidate_array) + /* This is needed when an SH4 dso with trampolines is used on SH4A. */ + .global GLOBAL(ic_invalidate_array) + FUNC(GLOBAL(ic_invalidate_array)) +GLOBAL(ic_invalidate_array): + add r1,r4 + synco + icbi @r4 + rts + nop + .align 2 + .long 0 + ENDFUNC(GLOBAL(ic_invalidate_array)) +#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) + .global GLOBAL(ic_invalidate_array) + .p2align 5 + FUNC(GLOBAL(ic_invalidate_array)) +/* This must be aligned to the beginning of a cache line. */ +GLOBAL(ic_invalidate_array): +#ifndef WAYS +#define WAYS 4 +#define WAY_SIZE 0x4000 +#endif +#if WAYS == 1 + .rept WAY_SIZE * WAYS / 32 + rts + nop + .rept 7 + .long WAY_SIZE - 32 + .endr + .endr +#elif WAYS <= 6 + .rept WAY_SIZE * WAYS / 32 + braf r0 + add #-8,r0 + .long WAY_SIZE + 8 + .long WAY_SIZE - 32 + .rept WAYS-2 + braf r0 + nop + .endr + .rept 7 - WAYS + rts + nop + .endr + .endr +#else /* WAYS > 6 */ + /* This variant needs two different pages for mmap-ing. 
*/ + .rept WAYS-1 + .rept WAY_SIZE / 32 + braf r0 + nop + .long WAY_SIZE + .rept 6 + .long WAY_SIZE - 32 + .endr + .endr + .endr + .rept WAY_SIZE / 32 + rts + .rept 15 + nop + .endr + .endr +#endif /* WAYS */ + ENDFUNC(GLOBAL(ic_invalidate_array)) +#endif /* SH4 */ +#endif /* L_ic_invalidate_array */ + +#if defined (__SH5__) && __SH5__ == 32 +#ifdef L_shcompact_call_trampoline + .section .rodata + .align 1 +LOCAL(ct_main_table): +.word LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label) + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + + /* This function loads 64-bit general-purpose registers from the + stack, from a memory address contained in them or from an FP + register, according to a cookie passed in r1. Its execution + time is linear on the number of registers that actually have + to be copied. See sh.h for details on the actual bit pattern. + + The function to be called is passed in r0. If a 32-bit return + value is expected, the actual function will be tail-called, + otherwise the return address will be stored in r10 (that the + caller should expect to be clobbered) and the return value + will be expanded into r2/r3 upon return. */ + + .global GLOBAL(GCC_shcompact_call_trampoline) + FUNC(GLOBAL(GCC_shcompact_call_trampoline)) +GLOBAL(GCC_shcompact_call_trampoline): + ptabs/l r0, tr0 /* Prepare to call the actual function. */ + movi ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0 + pt/l LOCAL(ct_loop), tr1 + addz.l r1, r63, r1 + shori ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0 +LOCAL(ct_loop): + nsb r1, r28 + shlli r28, 1, r29 + ldx.w r0, r29, r30 +LOCAL(ct_main_label): + ptrel/l r30, tr2 + blink tr2, r63 +LOCAL(ct_r2_fp): /* Copy r2 from an FP register. 
*/ + /* It must be dr0, so just do it. */ + fmov.dq dr0, r2 + movi 7, r30 + shlli r30, 29, r31 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r3_fp): /* Copy r3 from an FP register. */ + /* It is either dr0 or dr2. */ + movi 7, r30 + shlri r1, 26, r32 + shlli r30, 26, r31 + andc r1, r31, r1 + fmov.dq dr0, r3 + beqi/l r32, 4, tr1 + fmov.dq dr2, r3 + blink tr1, r63 +LOCAL(ct_r4_fp): /* Copy r4 from an FP register. */ + shlri r1, 23 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32 +LOCAL(ct_r4_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 23, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r4_fp_copy): + fmov.dq dr0, r4 + blink tr1, r63 + fmov.dq dr2, r4 + blink tr1, r63 + fmov.dq dr4, r4 + blink tr1, r63 +LOCAL(ct_r5_fp): /* Copy r5 from an FP register. */ + shlri r1, 20 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32 +LOCAL(ct_r5_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 20, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r5_fp_copy): + fmov.dq dr0, r5 + blink tr1, r63 + fmov.dq dr2, r5 + blink tr1, r63 + fmov.dq dr4, r5 + blink tr1, r63 + fmov.dq dr6, r5 + blink tr1, r63 +LOCAL(ct_r6_fph): /* Copy r6 from a high FP register. */ + /* It must be dr8. */ + fmov.dq dr8, r6 + movi 15, r30 + shlli r30, 16, r31 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r6_fpl): /* Copy r6 from a low FP register. */ + shlri r1, 16 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32 +LOCAL(ct_r6_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 16, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r6_fp_copy): + fmov.dq dr0, r6 + blink tr1, r63 + fmov.dq dr2, r6 + blink tr1, r63 + fmov.dq dr4, r6 + blink tr1, r63 + fmov.dq dr6, r6 + blink tr1, r63 +LOCAL(ct_r7_fph): /* Copy r7 from a high FP register. */ + /* It is either dr8 or dr10. */ + movi 15 << 12, r31 + shlri r1, 12, r32 + andc r1, r31, r1 + fmov.dq dr8, r7 + beqi/l r32, 8, tr1 + fmov.dq dr10, r7 + blink tr1, r63 +LOCAL(ct_r7_fpl): /* Copy r7 from a low FP register. */ + shlri r1, 12 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32 +LOCAL(ct_r7_fp_base): + ptrel/l r32, tr2 + movi 7 << 12, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r7_fp_copy): + fmov.dq dr0, r7 + blink tr1, r63 + fmov.dq dr2, r7 + blink tr1, r63 + fmov.dq dr4, r7 + blink tr1, r63 + fmov.dq dr6, r7 + blink tr1, r63 +LOCAL(ct_r8_fph): /* Copy r8 from a high FP register. */ + /* It is either dr8 or dr10. */ + movi 15 << 8, r31 + andi r1, 1 << 8, r32 + andc r1, r31, r1 + fmov.dq dr8, r8 + beq/l r32, r63, tr1 + fmov.dq dr10, r8 + blink tr1, r63 +LOCAL(ct_r8_fpl): /* Copy r8 from a low FP register. */ + shlri r1, 8 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32 +LOCAL(ct_r8_fp_base): + ptrel/l r32, tr2 + movi 7 << 8, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r8_fp_copy): + fmov.dq dr0, r8 + blink tr1, r63 + fmov.dq dr2, r8 + blink tr1, r63 + fmov.dq dr4, r8 + blink tr1, r63 + fmov.dq dr6, r8 + blink tr1, r63 +LOCAL(ct_r9_fph): /* Copy r9 from a high FP register. */ + /* It is either dr8 or dr10. */ + movi 15 << 4, r31 + andi r1, 1 << 4, r32 + andc r1, r31, r1 + fmov.dq dr8, r9 + beq/l r32, r63, tr1 + fmov.dq dr10, r9 + blink tr1, r63 +LOCAL(ct_r9_fpl): /* Copy r9 from a low FP register. 
*/ + shlri r1, 4 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32 +LOCAL(ct_r9_fp_base): + ptrel/l r32, tr2 + movi 7 << 4, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r9_fp_copy): + fmov.dq dr0, r9 + blink tr1, r63 + fmov.dq dr2, r9 + blink tr1, r63 + fmov.dq dr4, r9 + blink tr1, r63 + fmov.dq dr6, r9 + blink tr1, r63 +LOCAL(ct_r2_ld): /* Copy r2 from a memory address. */ + pt/l LOCAL(ct_r2_load), tr2 + movi 3, r30 + shlli r30, 29, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r2, 8, r3 + ldx.q r2, r63, r2 + /* Fall through. */ +LOCAL(ct_r3_ld): /* Copy r3 from a memory address. */ + pt/l LOCAL(ct_r3_load), tr2 + movi 3, r30 + shlli r30, 26, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r3, 8, r4 + ldx.q r3, r63, r3 +LOCAL(ct_r4_ld): /* Copy r4 from a memory address. */ + pt/l LOCAL(ct_r4_load), tr2 + movi 3, r30 + shlli r30, 23, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r4, 8, r5 + ldx.q r4, r63, r4 +LOCAL(ct_r5_ld): /* Copy r5 from a memory address. */ + pt/l LOCAL(ct_r5_load), tr2 + movi 3, r30 + shlli r30, 20, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r5, 8, r6 + ldx.q r5, r63, r5 +LOCAL(ct_r6_ld): /* Copy r6 from a memory address. */ + pt/l LOCAL(ct_r6_load), tr2 + movi 3 << 16, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r6, 8, r7 + ldx.q r6, r63, r6 +LOCAL(ct_r7_ld): /* Copy r7 from a memory address. */ + pt/l LOCAL(ct_r7_load), tr2 + movi 3 << 12, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r7, 8, r8 + ldx.q r7, r63, r7 +LOCAL(ct_r8_ld): /* Copy r8 from a memory address. */ + pt/l LOCAL(ct_r8_load), tr2 + movi 3 << 8, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r8, 8, r9 + ldx.q r8, r63, r8 +LOCAL(ct_r9_ld): /* Copy r9 from a memory address. */ + pt/l LOCAL(ct_check_tramp), tr2 + ldx.q r9, r63, r9 + blink tr2, r63 +LOCAL(ct_r2_load): + ldx.q r2, r63, r2 + blink tr1, r63 +LOCAL(ct_r3_load): + ldx.q r3, r63, r3 + blink tr1, r63 +LOCAL(ct_r4_load): + ldx.q r4, r63, r4 + blink tr1, r63 +LOCAL(ct_r5_load): + ldx.q r5, r63, r5 + blink tr1, r63 +LOCAL(ct_r6_load): + ldx.q r6, r63, r6 + blink tr1, r63 +LOCAL(ct_r7_load): + ldx.q r7, r63, r7 + blink tr1, r63 +LOCAL(ct_r8_load): + ldx.q r8, r63, r8 + blink tr1, r63 +LOCAL(ct_r2_pop): /* Pop r2 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r2 + shlli r30, 29, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r3_pop): /* Pop r3 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r3 + shlli r30, 26, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r4_pop): /* Pop r4 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r4 + shlli r30, 23, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r5_pop): /* Pop r5 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r5 + shlli r30, 20, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r6_pop): /* Pop r6 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r6 + shlli r30, 16, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r7_pop): /* Pop r7 from the stack. */ + ldx.q r15, r63, r7 + movi 1 << 12, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r8_pop): /* Pop r8 from the stack. 
*/ + ldx.q r15, r63, r8 + movi 1 << 8, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_pop_seq): /* Pop a sequence of registers off the stack. */ + andi r1, 7 << 1, r30 + movi (LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32 + shlli r30, 2, r31 + shori LOCAL(ct_end_of_pop_seq) & 65535, r32 + sub.l r32, r31, r33 + ptabs/l r33, tr2 + blink tr2, r63 +LOCAL(ct_start_of_pop_seq): /* Beginning of pop sequence. */ + ldx.q r15, r63, r3 + addi.l r15, 8, r15 + ldx.q r15, r63, r4 + addi.l r15, 8, r15 + ldx.q r15, r63, r5 + addi.l r15, 8, r15 + ldx.q r15, r63, r6 + addi.l r15, 8, r15 + ldx.q r15, r63, r7 + addi.l r15, 8, r15 + ldx.q r15, r63, r8 + addi.l r15, 8, r15 +LOCAL(ct_r9_pop): /* Pop r9 from the stack. */ + ldx.q r15, r63, r9 + addi.l r15, 8, r15 +LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction. */ +LOCAL(ct_check_tramp): /* Check whether we need a trampoline. */ + pt/u LOCAL(ct_ret_wide), tr2 + andi r1, 1, r1 + bne/u r1, r63, tr2 +LOCAL(ct_call_func): /* Just branch to the function. */ + blink tr0, r63 +LOCAL(ct_ret_wide): /* Call the function, so that we can unpack its + 64-bit return value. */ + add.l r18, r63, r10 + blink tr0, r18 + ptabs r10, tr0 +#if __LITTLE_ENDIAN__ + shari r2, 32, r3 + add.l r2, r63, r2 +#else + add.l r2, r63, r3 + shari r2, 32, r2 +#endif + blink tr0, r63 + + ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline)) +#endif /* L_shcompact_call_trampoline */ + +#ifdef L_shcompact_return_trampoline + /* This function does the converse of the code in `ret_wide' + above. It is tail-called by SHcompact functions returning + 64-bit non-floating-point values, to pack the 32-bit values in + r2 and r3 into r2. */ + + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + .global GLOBAL(GCC_shcompact_return_trampoline) + HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline)) +GLOBAL(GCC_shcompact_return_trampoline): + ptabs/l r18, tr0 +#if __LITTLE_ENDIAN__ + addz.l r2, r63, r2 + shlli r3, 32, r3 +#else + addz.l r3, r63, r3 + shlli r2, 32, r2 +#endif + or r3, r2, r2 + blink tr0, r63 + + ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline)) +#endif /* L_shcompact_return_trampoline */ + +#ifdef L_shcompact_incoming_args + .section .rodata + .align 1 +LOCAL(ia_main_table): +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r9_push) - datalabel 
LOCAL(ia_main_label) +.word LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_return) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_return) - datalabel LOCAL(ia_main_label) + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + + /* This function stores 64-bit general-purpose registers back in + the stack, and loads the address in which each register + was stored into itself. The lower 32 bits of r17 hold the address + to begin storing, and the upper 32 bits of r17 hold the cookie. + Its execution time is linear on the + number of registers that actually have to be copied, and it is + optimized for structures larger than 64 bits, as opposed to + individual `long long' arguments. See sh.h for details on the + actual bit pattern. */ + + .global GLOBAL(GCC_shcompact_incoming_args) + FUNC(GLOBAL(GCC_shcompact_incoming_args)) +GLOBAL(GCC_shcompact_incoming_args): + ptabs/l r18, tr0 /* Prepare to return. */ + shlri r17, 32, r0 /* Load the cookie. */ + movi ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43 + pt/l LOCAL(ia_loop), tr1 + add.l r17, r63, r17 + shori ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43 +LOCAL(ia_loop): + nsb r0, r36 + shlli r36, 1, r37 + ldx.w r43, r37, r38 +LOCAL(ia_main_label): + ptrel/l r38, tr2 + blink tr2, r63 +LOCAL(ia_r2_ld): /* Store r2 and load its address. */ + movi 3, r38 + shlli r38, 29, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r2 + add.l r17, r63, r2 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r3_ld): /* Store r3 and load its address. */ + movi 3, r38 + shlli r38, 26, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r3 + add.l r17, r63, r3 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r4_ld): /* Store r4 and load its address. */ + movi 3, r38 + shlli r38, 23, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r4 + add.l r17, r63, r4 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r5_ld): /* Store r5 and load its address. */ + movi 3, r38 + shlli r38, 20, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r5 + add.l r17, r63, r5 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r6_ld): /* Store r6 and load its address. */ + movi 3, r38 + shlli r38, 16, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r6 + add.l r17, r63, r6 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r7_ld): /* Store r7 and load its address. */ + movi 3 << 12, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r7 + add.l r17, r63, r7 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r8_ld): /* Store r8 and load its address. */ + movi 3 << 8, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r8 + add.l r17, r63, r8 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r9_ld): /* Store r9 and load its address. */ + stx.q r17, r63, r9 + add.l r17, r63, r9 + blink tr0, r63 +LOCAL(ia_r2_push): /* Push r2 onto the stack. */ + movi 1, r38 + shlli r38, 29, r39 + andc r0, r39, r0 + stx.q r17, r63, r2 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r3_push): /* Push r3 onto the stack. */ + movi 1, r38 + shlli r38, 26, r39 + andc r0, r39, r0 + stx.q r17, r63, r3 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r4_push): /* Push r4 onto the stack. */ + movi 1, r38 + shlli r38, 23, r39 + andc r0, r39, r0 + stx.q r17, r63, r4 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r5_push): /* Push r5 onto the stack. 
*/ + movi 1, r38 + shlli r38, 20, r39 + andc r0, r39, r0 + stx.q r17, r63, r5 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r6_push): /* Push r6 onto the stack. */ + movi 1, r38 + shlli r38, 16, r39 + andc r0, r39, r0 + stx.q r17, r63, r6 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r7_push): /* Push r7 onto the stack. */ + movi 1 << 12, r39 + andc r0, r39, r0 + stx.q r17, r63, r7 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r8_push): /* Push r8 onto the stack. */ + movi 1 << 8, r39 + andc r0, r39, r0 + stx.q r17, r63, r8 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_push_seq): /* Push a sequence of registers onto the stack. */ + andi r0, 7 << 1, r38 + movi (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40 + shlli r38, 2, r39 + shori LOCAL(ia_end_of_push_seq) & 65535, r40 + sub.l r40, r39, r41 + ptabs/l r41, tr2 + blink tr2, r63 +LOCAL(ia_stack_of_push_seq): /* Beginning of push sequence. */ + stx.q r17, r63, r3 + addi.l r17, 8, r17 + stx.q r17, r63, r4 + addi.l r17, 8, r17 + stx.q r17, r63, r5 + addi.l r17, 8, r17 + stx.q r17, r63, r6 + addi.l r17, 8, r17 + stx.q r17, r63, r7 + addi.l r17, 8, r17 + stx.q r17, r63, r8 + addi.l r17, 8, r17 +LOCAL(ia_r9_push): /* Push r9 onto the stack. */ + stx.q r17, r63, r9 +LOCAL(ia_return): /* Return. */ + blink tr0, r63 +LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction. */ + ENDFUNC(GLOBAL(GCC_shcompact_incoming_args)) +#endif /* L_shcompact_incoming_args */ +#endif +#if __SH5__ +#ifdef L_nested_trampoline +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 3 /* It is copied in units of 8 bytes in SHmedia mode. */ + .global GLOBAL(GCC_nested_trampoline) + HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline)) +GLOBAL(GCC_nested_trampoline): + .mode SHmedia + ptrel/u r63, tr0 + gettr tr0, r0 +#if __SH5__ == 64 + ld.q r0, 24, r1 +#else + ld.l r0, 24, r1 +#endif + ptabs/l r1, tr1 +#if __SH5__ == 64 + ld.q r0, 32, r1 +#else + ld.l r0, 28, r1 +#endif + blink tr1, r63 + + ENDFUNC(GLOBAL(GCC_nested_trampoline)) +#endif /* L_nested_trampoline */ +#endif /* __SH5__ */ +#if __SH5__ == 32 +#ifdef L_push_pop_shmedia_regs + .section .text..SHmedia32,"ax" + .mode SHmedia + .align 2 +#ifndef __SH4_NOFPU__ + .global GLOBAL(GCC_push_shmedia_regs) + FUNC(GLOBAL(GCC_push_shmedia_regs)) +GLOBAL(GCC_push_shmedia_regs): + addi.l r15, -14*8, r15 + fst.d r15, 13*8, dr62 + fst.d r15, 12*8, dr60 + fst.d r15, 11*8, dr58 + fst.d r15, 10*8, dr56 + fst.d r15, 9*8, dr54 + fst.d r15, 8*8, dr52 + fst.d r15, 7*8, dr50 + fst.d r15, 6*8, dr48 + fst.d r15, 5*8, dr46 + fst.d r15, 4*8, dr44 + fst.d r15, 3*8, dr42 + fst.d r15, 2*8, dr40 + fst.d r15, 1*8, dr38 + fst.d r15, 0*8, dr36 +#else /* ! __SH4_NOFPU__ */ + .global GLOBAL(GCC_push_shmedia_regs_nofpu) + FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu)) +GLOBAL(GCC_push_shmedia_regs_nofpu): +#endif /* ! 
__SH4_NOFPU__ */ + ptabs/l r18, tr0 + addi.l r15, -27*8, r15 + gettr tr7, r62 + gettr tr6, r61 + gettr tr5, r60 + st.q r15, 26*8, r62 + st.q r15, 25*8, r61 + st.q r15, 24*8, r60 + st.q r15, 23*8, r59 + st.q r15, 22*8, r58 + st.q r15, 21*8, r57 + st.q r15, 20*8, r56 + st.q r15, 19*8, r55 + st.q r15, 18*8, r54 + st.q r15, 17*8, r53 + st.q r15, 16*8, r52 + st.q r15, 15*8, r51 + st.q r15, 14*8, r50 + st.q r15, 13*8, r49 + st.q r15, 12*8, r48 + st.q r15, 11*8, r47 + st.q r15, 10*8, r46 + st.q r15, 9*8, r45 + st.q r15, 8*8, r44 + st.q r15, 7*8, r35 + st.q r15, 6*8, r34 + st.q r15, 5*8, r33 + st.q r15, 4*8, r32 + st.q r15, 3*8, r31 + st.q r15, 2*8, r30 + st.q r15, 1*8, r29 + st.q r15, 0*8, r28 + blink tr0, r63 +#ifndef __SH4_NOFPU__ + ENDFUNC(GLOBAL(GCC_push_shmedia_regs)) +#else + ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu)) +#endif +#ifndef __SH4_NOFPU__ + .global GLOBAL(GCC_pop_shmedia_regs) + FUNC(GLOBAL(GCC_pop_shmedia_regs)) +GLOBAL(GCC_pop_shmedia_regs): + pt .L0, tr1 + movi 41*8, r0 + fld.d r15, 40*8, dr62 + fld.d r15, 39*8, dr60 + fld.d r15, 38*8, dr58 + fld.d r15, 37*8, dr56 + fld.d r15, 36*8, dr54 + fld.d r15, 35*8, dr52 + fld.d r15, 34*8, dr50 + fld.d r15, 33*8, dr48 + fld.d r15, 32*8, dr46 + fld.d r15, 31*8, dr44 + fld.d r15, 30*8, dr42 + fld.d r15, 29*8, dr40 + fld.d r15, 28*8, dr38 + fld.d r15, 27*8, dr36 + blink tr1, r63 +#else /* ! __SH4_NOFPU__ */ + .global GLOBAL(GCC_pop_shmedia_regs_nofpu) + FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu)) +GLOBAL(GCC_pop_shmedia_regs_nofpu): +#endif /* ! __SH4_NOFPU__ */ + movi 27*8, r0 +.L0: + ptabs r18, tr0 + ld.q r15, 26*8, r62 + ld.q r15, 25*8, r61 + ld.q r15, 24*8, r60 + ptabs r62, tr7 + ptabs r61, tr6 + ptabs r60, tr5 + ld.q r15, 23*8, r59 + ld.q r15, 22*8, r58 + ld.q r15, 21*8, r57 + ld.q r15, 20*8, r56 + ld.q r15, 19*8, r55 + ld.q r15, 18*8, r54 + ld.q r15, 17*8, r53 + ld.q r15, 16*8, r52 + ld.q r15, 15*8, r51 + ld.q r15, 14*8, r50 + ld.q r15, 13*8, r49 + ld.q r15, 12*8, r48 + ld.q r15, 11*8, r47 + ld.q r15, 10*8, r46 + ld.q r15, 9*8, r45 + ld.q r15, 8*8, r44 + ld.q r15, 7*8, r35 + ld.q r15, 6*8, r34 + ld.q r15, 5*8, r33 + ld.q r15, 4*8, r32 + ld.q r15, 3*8, r31 + ld.q r15, 2*8, r30 + ld.q r15, 1*8, r29 + ld.q r15, 0*8, r28 + add.l r15, r0, r15 + blink tr0, r63 + +#ifndef __SH4_NOFPU__ + ENDFUNC(GLOBAL(GCC_pop_shmedia_regs)) +#else + ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu)) +#endif +#endif /* __SH5__ == 32 */ +#endif /* L_push_pop_shmedia_regs */ + +#ifdef L_div_table +#if __SH5__ +#if defined(__pic__) && defined(__SHMEDIA__) + .global GLOBAL(sdivsi3) + FUNC(GLOBAL(sdivsi3)) +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif +#if 0 +/* ??? FIXME: Presumably due to a linker bug, exporting data symbols + in a text section does not work (at least for shared libraries): + the linker sets the LSB of the address as if this was SHmedia code. 
*/ +#define TEXT_DATA_BUG +#endif + .align 2 + // inputs: r4,r5 + // clobbered: r1,r18,r19,r20,r21,r25,tr0 + // result in r0 + .global GLOBAL(sdivsi3) +GLOBAL(sdivsi3): +#ifdef TEXT_DATA_BUG + ptb datalabel Local_div_table,tr0 +#else + ptb GLOBAL(div_table_internal),tr0 +#endif + nsb r5, r1 + shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62 + shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1) + /* bubble */ + gettr tr0,r20 + ldx.ub r20, r21, r19 // u0.8 + shari r25, 32, r25 // normalize to s2.30 + shlli r21, 1, r21 + muls.l r25, r19, r19 // s2.38 + ldx.w r20, r21, r21 // s2.14 + ptabs r18, tr0 + shari r19, 24, r19 // truncate to s2.14 + sub r21, r19, r19 // some 11 bit inverse in s1.14 + muls.l r19, r19, r21 // u0.28 + sub r63, r1, r1 + addi r1, 92, r1 + muls.l r25, r21, r18 // s2.58 + shlli r19, 45, r19 // multiply by two and convert to s2.58 + /* bubble */ + sub r19, r18, r18 + shari r18, 28, r18 // some 22 bit inverse in s1.30 + muls.l r18, r25, r0 // s2.60 + muls.l r18, r4, r25 // s32.30 + /* bubble */ + shari r0, 16, r19 // s-16.44 + muls.l r19, r18, r19 // s-16.74 + shari r25, 63, r0 + shari r4, 14, r18 // s19.-14 + shari r19, 30, r19 // s-16.44 + muls.l r19, r18, r19 // s15.30 + xor r21, r0, r21 // You could also use the constant 1 << 27. + add r21, r25, r21 + sub r21, r19, r21 + shard r21, r1, r21 + sub r21, r0, r0 + blink tr0, r63 + ENDFUNC(GLOBAL(sdivsi3)) +/* This table has been generated by divtab.c . +Defects for bias -330: + Max defect: 6.081536e-07 at -1.000000e+00 + Min defect: 2.849516e-08 at 1.030651e+00 + Max 2nd step defect: 9.606539e-12 at -1.000000e+00 + Min 2nd step defect: 0.000000e+00 at 0.000000e+00 + Defect at 1: 1.238659e-07 + Defect at -2: 1.061708e-07 */ +#else /* ! __pic__ || ! __SHMEDIA__ */ + .section .rodata +#endif /* __pic__ */ +#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__) + .balign 2 + .type Local_div_table,@object + .size Local_div_table,128 +/* negative division constants */ + .word -16638 + .word -17135 + .word -17737 + .word -18433 + .word -19103 + .word -19751 + .word -20583 + .word -21383 + .word -22343 + .word -23353 + .word -24407 + .word -25582 + .word -26863 + .word -28382 + .word -29965 + .word -31800 +/* negative division factors */ + .byte 66 + .byte 70 + .byte 75 + .byte 81 + .byte 87 + .byte 93 + .byte 101 + .byte 109 + .byte 119 + .byte 130 + .byte 142 + .byte 156 + .byte 172 + .byte 192 + .byte 214 + .byte 241 + .skip 16 +Local_div_table: + .skip 16 +/* positive division factors */ + .byte 241 + .byte 214 + .byte 192 + .byte 172 + .byte 156 + .byte 142 + .byte 130 + .byte 119 + .byte 109 + .byte 101 + .byte 93 + .byte 87 + .byte 81 + .byte 75 + .byte 70 + .byte 66 +/* positive division constants */ + .word 31801 + .word 29966 + .word 28383 + .word 26864 + .word 25583 + .word 24408 + .word 23354 + .word 22344 + .word 21384 + .word 20584 + .word 19752 + .word 19104 + .word 18434 + .word 17738 + .word 17136 + .word 16639 + .section .rodata +#endif /* TEXT_DATA_BUG */ + .balign 2 + .type GLOBAL(div_table),@object + .size GLOBAL(div_table),128 +/* negative division constants */ + .word -16638 + .word -17135 + .word -17737 + .word -18433 + .word -19103 + .word -19751 + .word -20583 + .word -21383 + .word -22343 + .word -23353 + .word -24407 + .word -25582 + .word -26863 + .word -28382 + .word -29965 + .word -31800 +/* negative division factors */ + .byte 66 + .byte 70 + .byte 75 + .byte 81 + .byte 87 + .byte 93 + .byte 101 + .byte 109 + .byte 119 + .byte 130 + .byte 142 + .byte 156 + .byte 172 
+ .byte 192 + .byte 214 + .byte 241 + .skip 16 + .global GLOBAL(div_table) +GLOBAL(div_table): + HIDDEN_ALIAS(div_table_internal,div_table) + .skip 16 +/* positive division factors */ + .byte 241 + .byte 214 + .byte 192 + .byte 172 + .byte 156 + .byte 142 + .byte 130 + .byte 119 + .byte 109 + .byte 101 + .byte 93 + .byte 87 + .byte 81 + .byte 75 + .byte 70 + .byte 66 +/* positive division constants */ + .word 31801 + .word 29966 + .word 28383 + .word 26864 + .word 25583 + .word 24408 + .word 23354 + .word 22344 + .word 21384 + .word 20584 + .word 19752 + .word 19104 + .word 18434 + .word 17738 + .word 17136 + .word 16639 + +#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__) +/* This code used shld, thus is not suitable for SH1 / SH2. */ + +/* Signed / unsigned division without use of FPU, optimized for SH4. + Uses a lookup table for divisors in the range -128 .. +128, and + div1 with case distinction for larger divisors in three more ranges. + The code is lumped together with the table to allow the use of mova. */ +#ifdef __LITTLE_ENDIAN__ +#define L_LSB 0 +#define L_LSWMSB 1 +#define L_MSWLSB 2 +#else +#define L_LSB 3 +#define L_LSWMSB 2 +#define L_MSWLSB 1 +#endif + + .balign 4 + .global GLOBAL(udivsi3_i4i) + FUNC(GLOBAL(udivsi3_i4i)) +GLOBAL(udivsi3_i4i): + mov.w LOCAL(c128_w), r1 + div0u + mov r4,r0 + shlr8 r0 + cmp/hi r1,r5 + extu.w r5,r1 + bf LOCAL(udiv_le128) + cmp/eq r5,r1 + bf LOCAL(udiv_ge64k) + shlr r0 + mov r5,r1 + shll16 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 + div1 r5,r0 + div1 r5,r0 + bra LOCAL(udiv_25) + div1 r5,r0 + +LOCAL(div_le128): + mova LOCAL(div_table_ix),r0 + bra LOCAL(div_le128_2) + mov.b @(r0,r5),r1 +LOCAL(udiv_le128): + mov.l r4,@-r15 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mov.l r5,@-r15 +LOCAL(div_le128_2): + mova LOCAL(div_table_inv),r0 + mov.l @(r0,r1),r1 + mov r5,r0 + tst #0xfe,r0 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + bt/s LOCAL(div_by_1) + mov r4,r0 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + rts + shld r1,r0 + +LOCAL(div_by_1_neg): + neg r4,r0 +LOCAL(div_by_1): + mov.l @r15+,r5 + rts + mov.l @r15+,r4 + +LOCAL(div_ge64k): + bt/s LOCAL(div_r8) + div0u + shll8 r5 + bra LOCAL(div_ge64k_2) + div1 r5,r0 +LOCAL(udiv_ge64k): + cmp/hi r0,r5 + mov r5,r1 + bt LOCAL(udiv_r8) + shll8 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 +LOCAL(div_ge64k_2): + div1 r5,r0 + mov.l LOCAL(zero_l),r1 + .rept 4 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_end) + xor r4,r0 + +LOCAL(div_r8): + shll16 r4 + bra LOCAL(div_r8_2) + shll8 r4 +LOCAL(udiv_r8): + mov.l r4,@-r15 + shll16 r4 + clrt + shll8 r4 + mov.l r5,@-r15 +LOCAL(div_r8_2): + rotcl r4 + mov r0,r1 + div1 r5,r1 + mov r4,r0 + rotcl r0 + mov r5,r4 + div1 r5,r1 + .rept 5 + rotcl r0; div1 r5,r1 + .endr + rotcl r0 + mov.l @r15+,r5 + div1 r4,r1 + mov.l @r15+,r4 + rts + rotcl r0 + + ENDFUNC(GLOBAL(udivsi3_i4i)) + + .global GLOBAL(sdivsi3_i4i) + FUNC(GLOBAL(sdivsi3_i4i)) + /* This is link-compatible with a GLOBAL(sdivsi3) call, + but we effectively clobber only r1. 
*/ +GLOBAL(sdivsi3_i4i): + mov.l r4,@-r15 + cmp/pz r5 + mov.w LOCAL(c128_w), r1 + bt/s LOCAL(pos_divisor) + cmp/pz r4 + mov.l r5,@-r15 + neg r5,r5 + bt/s LOCAL(neg_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(pos_result): + extu.w r5,r0 + bf LOCAL(div_le128) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k) + cmp/hi r0,r5 + div0u + shll16 r5 + div1 r5,r0 + div1 r5,r0 + div1 r5,r0 +LOCAL(udiv_25): + mov.l LOCAL(zero_l),r1 + div1 r5,r0 + div1 r5,r0 + mov.l r1,@-r15 + .rept 3 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r0 + mov.l @r15+,r5 + or r4,r0 + mov.l @r15+,r4 + rts + rotcl r0 + +LOCAL(div_le128_neg): + tst #0xfe,r0 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mova LOCAL(div_table_inv),r0 + bt/s LOCAL(div_by_1_neg) + mov.l @(r0,r1),r1 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + shld r1,r0 + rts + neg r0,r0 + +LOCAL(pos_divisor): + mov.l r5,@-r15 + bt/s LOCAL(pos_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(neg_result): + extu.w r5,r0 + bf LOCAL(div_le128_neg) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k_neg) + cmp/hi r0,r5 + div0u + mov.l LOCAL(zero_l),r1 + shll16 r5 + div1 r5,r0 + mov.l r1,@-r15 + .rept 7 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_neg_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r1 + mov.l @r15+,r5 + or r4,r1 +LOCAL(div_r8_neg_end): + mov.l @r15+,r4 + rotcl r1 + rts + neg r1,r0 + +LOCAL(div_ge64k_neg): + bt/s LOCAL(div_r8_neg) + div0u + shll8 r5 + mov.l LOCAL(zero_l),r1 + .rept 6 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_neg_end) + xor r4,r0 + +LOCAL(c128_w): + .word 128 + +LOCAL(div_r8_neg): + clrt + shll16 r4 + mov r4,r1 + shll8 r1 + mov r5,r4 + .rept 7 + rotcl r1; div1 r5,r0 + .endr + mov.l @r15+,r5 + rotcl r1 + bra LOCAL(div_r8_neg_end) + div1 r4,r0 + +LOCAL(m256_w): + .word 0xff00 +/* This table has been generated by divtab-sh4.c. 
*/ + .balign 4 +LOCAL(div_table_clz): + .byte 0 + .byte 1 + .byte 0 + .byte -1 + .byte -1 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 +/* Lookup table translating positive divisor to index into table of + normalized inverse. N.B. the '0' entry is also the last entry of the + previous table, and causes an unaligned access for division by zero. */ +LOCAL(div_table_ix): + .byte -6 + .byte -128 + .byte -128 + .byte 0 + .byte -128 + .byte -64 + .byte 0 + .byte 64 + .byte -128 + .byte -96 + .byte -64 + .byte -32 + .byte 0 + .byte 32 + .byte 64 + .byte 96 + .byte -128 + .byte -112 + .byte -96 + .byte -80 + .byte -64 + .byte -48 + .byte -32 + .byte -16 + .byte 0 + .byte 16 + .byte 32 + .byte 48 + .byte 64 + .byte 80 + .byte 96 + .byte 112 + .byte -128 + .byte -120 + .byte -112 + .byte -104 + .byte -96 + .byte -88 + .byte -80 + .byte -72 + .byte -64 + .byte -56 + .byte -48 + .byte -40 + .byte -32 + .byte -24 + .byte -16 + .byte -8 + .byte 0 + .byte 8 + .byte 16 + .byte 24 + .byte 32 + .byte 40 + .byte 48 + .byte 56 + .byte 64 + .byte 72 + .byte 80 + .byte 88 + .byte 96 + .byte 104 + .byte 112 + .byte 120 + .byte -128 + .byte -124 + .byte -120 + .byte -116 + .byte -112 + .byte -108 + .byte -104 + .byte -100 + .byte -96 + .byte -92 + .byte -88 + .byte -84 + .byte -80 + .byte -76 + .byte -72 + .byte -68 + .byte -64 + .byte -60 + .byte -56 + .byte -52 + .byte -48 + .byte -44 + .byte -40 + .byte -36 + .byte -32 + .byte -28 + .byte -24 + .byte -20 + .byte -16 + .byte -12 + .byte -8 + .byte -4 + .byte 0 + .byte 4 + .byte 8 + .byte 12 + .byte 16 + .byte 20 + .byte 24 + .byte 28 + .byte 32 + .byte 36 + .byte 40 + .byte 44 + .byte 48 + .byte 52 + .byte 56 + .byte 60 + .byte 64 + .byte 68 + .byte 72 + .byte 76 + .byte 80 + .byte 84 + .byte 88 + .byte 92 + .byte 96 + .byte 100 + .byte 104 + .byte 108 + .byte 112 + .byte 116 + .byte 120 + .byte 124 + .byte -128 +/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. 
*/ + .balign 4 +LOCAL(zero_l): + .long 0x0 + .long 0xF81F81F9 + .long 0xF07C1F08 + .long 0xE9131AC0 + .long 0xE1E1E1E2 + .long 0xDAE6076C + .long 0xD41D41D5 + .long 0xCD856891 + .long 0xC71C71C8 + .long 0xC0E07039 + .long 0xBACF914D + .long 0xB4E81B4F + .long 0xAF286BCB + .long 0xA98EF607 + .long 0xA41A41A5 + .long 0x9EC8E952 + .long 0x9999999A + .long 0x948B0FCE + .long 0x8F9C18FA + .long 0x8ACB90F7 + .long 0x86186187 + .long 0x81818182 + .long 0x7D05F418 + .long 0x78A4C818 + .long 0x745D1746 + .long 0x702E05C1 + .long 0x6C16C16D + .long 0x68168169 + .long 0x642C8591 + .long 0x60581606 + .long 0x5C9882BA + .long 0x58ED2309 +LOCAL(div_table_inv): + .long 0x55555556 + .long 0x51D07EAF + .long 0x4E5E0A73 + .long 0x4AFD6A06 + .long 0x47AE147B + .long 0x446F8657 + .long 0x41414142 + .long 0x3E22CBCF + .long 0x3B13B13C + .long 0x38138139 + .long 0x3521CFB3 + .long 0x323E34A3 + .long 0x2F684BDB + .long 0x2C9FB4D9 + .long 0x29E4129F + .long 0x27350B89 + .long 0x24924925 + .long 0x21FB7813 + .long 0x1F7047DD + .long 0x1CF06ADB + .long 0x1A7B9612 + .long 0x18118119 + .long 0x15B1E5F8 + .long 0x135C8114 + .long 0x11111112 + .long 0xECF56BF + .long 0xC9714FC + .long 0xA6810A7 + .long 0x8421085 + .long 0x624DD30 + .long 0x4104105 + .long 0x2040811 + /* maximum error: 0.987342 scaled: 0.921875*/ + + ENDFUNC(GLOBAL(sdivsi3_i4i)) +#endif /* SH3 / SH4 */ + +#endif /* L_div_table */ + +#ifdef L_udiv_qrnnd_16 +#if !__SHMEDIA__ + HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16)) + /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */ + /* n1 < d, but n1 might be larger than d1. */ + .global GLOBAL(udiv_qrnnd_16) + .balign 8 +GLOBAL(udiv_qrnnd_16): + div0u + cmp/hi r6,r0 + bt .Lots + .rept 16 + div1 r6,r0 + .endr + extu.w r0,r1 + bt 0f + add r6,r0 +0: rotcl r1 + mulu.w r1,r5 + xtrct r4,r0 + swap.w r0,r0 + sts macl,r2 + cmp/hs r2,r0 + sub r2,r0 + bt 0f + addc r5,r0 + add #-1,r1 + bt 0f +1: add #-1,r1 + rts + add r5,r0 + .balign 8 +.Lots: + sub r5,r0 + swap.w r4,r1 + xtrct r0,r1 + clrt + mov r1,r0 + addc r5,r0 + mov #-1,r1 + SL1(bf, 1b, + shlr16 r1) +0: rts + nop + ENDFUNC(GLOBAL(udiv_qrnnd_16)) +#endif /* !__SHMEDIA__ */ +#endif /* L_udiv_qrnnd_16 */ diff --git a/libgcc/config/sh/lib1funcs.h b/libgcc/config/sh/lib1funcs.h new file mode 100644 index 00000000000..af4b41cc314 --- /dev/null +++ b/libgcc/config/sh/lib1funcs.h @@ -0,0 +1,76 @@ +/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2009 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +#ifdef __ELF__ +#define LOCAL(X) .L_##X +#define FUNC(X) .type X,@function +#define HIDDEN_FUNC(X) FUNC(X); .hidden X +#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y); .hidden GLOBAL(X) +#define ENDFUNC0(X) .Lfe_##X: .size X,.Lfe_##X-X +#define ENDFUNC(X) ENDFUNC0(X) +#else +#define LOCAL(X) L_##X +#define FUNC(X) +#define HIDDEN_FUNC(X) +#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y) +#define ENDFUNC(X) +#endif + +#define CONCAT(A,B) A##B +#define GLOBAL0(U,X) CONCAT(U,__##X) +#define GLOBAL(X) GLOBAL0(__USER_LABEL_PREFIX__,X) + +#define ALIAS(X,Y) .global GLOBAL(X); .set GLOBAL(X),GLOBAL(Y) + +#if defined __SH2A__ && defined __FMOVD_ENABLED__ +#undef FMOVD_WORKS +#define FMOVD_WORKS +#endif + +#ifdef __LITTLE_ENDIAN__ +#define DR00 fr1 +#define DR01 fr0 +#define DR20 fr3 +#define DR21 fr2 +#define DR40 fr5 +#define DR41 fr4 +#else /* !__LITTLE_ENDIAN__ */ +#define DR00 fr0 +#define DR01 fr1 +#define DR20 fr2 +#define DR21 fr3 +#define DR40 fr4 +#define DR41 fr5 +#endif /* !__LITTLE_ENDIAN__ */ + +#ifdef __sh1__ +#define SL(branch, dest, in_slot, in_slot_arg2) \ + in_slot, in_slot_arg2; branch dest +#define SL1(branch, dest, in_slot) \ + in_slot; branch dest +#else /* ! __sh1__ */ +#define SL(branch, dest, in_slot, in_slot_arg2) \ + branch##.s dest; in_slot, in_slot_arg2 +#define SL1(branch, dest, in_slot) \ + branch##/s dest; in_slot +#endif /* !__sh1__ */ diff --git a/libgcc/config/sh/t-linux b/libgcc/config/sh/t-linux index af618e260c6..9b1feacd1f3 100644 --- a/libgcc/config/sh/t-linux +++ b/libgcc/config/sh/t-linux @@ -1,3 +1,5 @@ +LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array + HOST_LIBGCC2_CFLAGS = -fpic -mieee -DNO_FPSCR_VALUES # Override t-slibgcc-elf-ver to export some libgcc symbols with diff --git a/libgcc/config/sh/t-netbsd b/libgcc/config/sh/t-netbsd new file mode 100644 index 00000000000..663edbf4187 --- /dev/null +++ b/libgcc/config/sh/t-netbsd @@ -0,0 +1 @@ +LIB1ASMFUNCS_CACHE = _ic_invalidate diff --git a/libgcc/config/sh/t-sh b/libgcc/config/sh/t-sh index ab4d98089b1..2319adbef1d 100644 --- a/libgcc/config/sh/t-sh +++ b/libgcc/config/sh/t-sh @@ -17,26 +17,33 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. 
+LIB1ASMSRC = sh/lib1funcs.S +LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \ + _movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \ + _div_table _udiv_qrnnd_16 \ + $(LIB1ASMFUNCS_CACHE) +LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array + crt1.o: $(srcdir)/config/sh/crt1.S $(gcc_compile) -c $< -ic_invalidate_array_4-100.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4-100.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=1 -DWAY_SIZE=0x2000 $< libic_invalidate_array_4-100.a: ic_invalidate_array_4-100.o $(AR_CREATE_FOR_TARGET) $@ $< -ic_invalidate_array_4-200.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4-200.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=2 -DWAY_SIZE=0x2000 $< libic_invalidate_array_4-200.a: ic_invalidate_array_4-200.o $(AR_CREATE_FOR_TARGET) $@ $< -ic_invalidate_array_4a.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4a.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -D__FORCE_SH4A__ $< libic_invalidate_array_4a.a: ic_invalidate_array_4a.o $(AR_CREATE_FOR_TARGET) $@ $< sdivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S - $(gcc_compile) -c -DL_sdivsi3_i4i $< + $(compile) -c -DL_sdivsi3_i4i $< udivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S $(gcc_compile) -c -DL_udivsi3_i4i $< unwind-dw2-Os-4-200.o: $(gcc_srcdir)/unwind-dw2.c diff --git a/libgcc/config/sh/t-sh64 b/libgcc/config/sh/t-sh64 new file mode 100644 index 00000000000..fa9950e03b2 --- /dev/null +++ b/libgcc/config/sh/t-sh64 @@ -0,0 +1,6 @@ +LIB1ASMFUNCS = \ + _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \ + _shcompact_call_trampoline _shcompact_return_trampoline \ + _shcompact_incoming_args _ic_invalidate _nested_trampoline \ + _push_pop_shmedia_regs \ + _udivdi3 _divdi3 _umoddi3 _moddi3 _div_table
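
For orientation, a minimal usage sketch (not part of the patch): the LIB1ASMFUNCS entries above are never called by hand. Classic SH has only the div0/div1 step primitives rather than a full divide instruction, so GCC expands integer division into calls to libgcc helpers, and these makefile fragments merely select which lib1funcs.S entry points are assembled into libgcc for a given configuration. The C fragment below is illustrative; the function names are invented, and which helper is actually chosen (plain, _i4, _i4i, or the SHmedia variants) depends on the -m options, but compiling it with an SH cross compiler (e.g. sh-elf-gcc -O2 -S) should show calls resolved by the objects built here.

/* Illustrative only -- not part of this commit.  */

unsigned int
udiv32 (unsigned int n, unsigned int d)
{
  /* Typically expands to a call to __udivsi3 (or __udivsi3_i4 /
     __udivsi3_i4i, depending on the selected CPU), provided by the
     _udivsi3* entries built from lib1funcs.S.  */
  return n / d;
}

unsigned long long
udiv64 (unsigned long long n, unsigned long long d)
{
  /* 64-bit division on a 32-bit SH target typically expands to a call
     to __udivdi3, one of the entries t-sh64 lists in LIB1ASMFUNCS.  */
  return n / d;
}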